
2    ******************************************************************************************
2    *************
3    This code calculates two intermediate ownership variables used to identify sister firms,
3    including:
4    the percentage of a firms shares held by an institutional investor, and
5    the percentage of an institutional investors equity portfolio invested in a firm.
6    
7    *The code is adapted from:
8    *Luis Palacios, Rabih Moussawi, and Denys Glushkov (2009), available through WRDS Research
8     Applications.
9    *Modifications were made for our sample coverage and definitions
10   ******************************************************************************************
10   *************;
11   
12   libname input 'D:\sas data\input';
NOTE: Libref INPUT was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\input
13   libname wip 'D:\sas data\wip';
NOTE: Libref WIP was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\wip
14   libname output 'D:\sas data\output';
NOTE: Libref OUTPUT was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\output
15   
16   /* Step1. Specifying options */
17   *Adjust share and price in monthly data;
18   *and since Thomson 13F is quarterly (FDATE & RDATE);
19   *align CRSP month-end dates and keep quarter ends;
20   *crsp_m is the CRSP monthly data file;
21   data wip.crsp_m; format QDATE date9.;
22   set input.crsp_m;
23   QDATE = INTNX('QTR',date,0,'E');
24   DATE = INTNX("MONTH",date,0,"E");
25   P = abs(prc)/cfacpr;
26   TSO=shrout*cfacshr*1000;
27   if TSO<=0 then TSO=.;
28   ME = P*TSO/1000000;
29   label P = "Price at Period End, Adjusted";
30   label TSO = "Total Shares Outstanding, Adjusted";
31   label ME = "Market Capitalization, x$1m";
32   drop ncusip prc cfacpr shrout exchcd shrcd ret;
33   format ret percentn8.4 ME P dollar12.3 TSO comma12.;
34   run;

NOTE: There were 3282906 observations read from the data set INPUT.CRSP_M.
NOTE: The data set WIP.CRSP_M has 3282906 observations and 7 variables.
NOTE: DATA statement used (Total process time):
      real time           1.75 seconds
      cpu time            1.53 seconds
      

35   
36   *Keep last monthly observation for each quarter;
37   data wip.crsp_m;
38   set wip.crsp_m;
39   by permno qdate date;
40   if last.qdate;
41   drop date;
42   run;

NOTE: There were 3282906 observations read from the data set WIP.CRSP_M.
NOTE: The data set WIP.CRSP_M has 1112338 observations and 6 variables.
NOTE: DATA statement used (Total process time):
      real time           0.38 seconds
      cpu time            0.31 seconds
      

43   
44   /* Step2. Merge TR-13F S34type1 and S34type3 sets */
45   *First, keep first vintage with holdings data for each RDATE-MGRNO combinations;
46   *s34type1 is from Thomson Reuters on WRDS;
47   proc sql;
48   create table wip.First_Vint
49   as select distinct rdate, fdate, mgrno, mgrname,TYPECODE
50   from input.s34type1
51   group by mgrno, rdate
52   having fdate=min(fdate)
53   order by mgrno, rdate;
NOTE: The query requires remerging summary statistics back with the original data.
NOTE: Table WIP.FIRST_VINT created, with 256163 rows and 5 columns.

54   quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.25 seconds
      cpu time            0.23 seconds
      

55   
56   *Marker for first and last quarters of reporting & reporting gaps;
57   data wip.First_Vint;
58   set wip.First_Vint;
59   by mgrno rdate;
60   length First_Report 3;
61   First_Report = (first.mgrno or intck("QTR",lag(rdate),rdate)>1);
62   run;

NOTE: There were 256163 observations read from the data set WIP.FIRST_VINT.
NOTE: The data set WIP.FIRST_VINT has 256163 observations and 6 variables.
NOTE: DATA statement used (Total process time):
      real time           0.10 seconds
      cpu time            0.07 seconds
      

63   
64   *Last report by institutional manager, or missing 13F reports in the next quarter(s);
65   proc sort data=wip.First_Vint nodupkey; by mgrno descending rdate; run;

NOTE: There were 256163 observations read from the data set WIP.FIRST_VINT.
NOTE: 0 observations with duplicate key values were deleted.
NOTE: The data set WIP.FIRST_VINT has 256163 observations and 6 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.08 seconds
      cpu time            0.07 seconds
      

66   
67   data wip.First_Vint;
68   set wip.First_Vint;
69   by mgrno descending rdate;
70   length Last_Report 3;
71   Last_Report = (first.mgrno or intck("QTR",rdate,lag(rdate))>1);
72   run;
 
NOTE: There were 256163 observations read from the data set WIP.FIRST_VINT.
NOTE: The data set WIP.FIRST_VINT has 256163 observations and 7 variables.
NOTE: DATA statement used (Total process time):
      real time           0.10 seconds
      cpu time            0.06 seconds
      

73   
74   *Add total number of 13F filers during each quarter;
75   proc sql undo_policy=none;
76   create table wip.First_Vint
77   as select distinct *, count(mgrno) as NumInst
78   from wip.First_Vint
79   group by rdate
80   order by fdate, mgrno;
NOTE: The query requires remerging summary statistics back with the original data.
NOTE: Table WIP.FIRST_VINT created, with 256163 rows and 8 columns.

81   quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.18 seconds
      cpu time            0.26 seconds
      

82   
83   *Extract Holdings and Adjust Shares Held
84   *FDATE -Vintage date- is used in shares' adjustment;
85   *s34type3 is from Thomson Reuters on WRDS;
86   proc sql;
86             create table wip.Holdings_v1  as
87    select
87   a.RDATE,a.FDATE,a.MGRNO,a.First_Report,a.Last_Report,a.NumInst,a.TYPECODE,a.MGRNAME,b.CUSI
87   P,b.SHARES
88    from wip.First_Vint as a, input.s34type3 as b
89    where a.fdate=b.fdate and a.mgrno=b.mgrno and shares>0
90    order by fdate, mgrno;
NOTE: Table WIP.HOLDINGS_V1 created, with 63025811 rows and 10 columns.

91   quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           31.96 seconds
      cpu time            34.43 seconds
      

92   
93   /* Step3. Map TR-13F's historical CUSIP to CRSP unique identifier PERMNO */
94   *Keep securities in CRSP common stock universe;
95   *crsp_msenames is the CRSP monthly event file;
96   proc sql;
97   create table wip.Holdings_v2 as
98   select distinct  a.rdate, a.fdate, a.mgrno, a.NumInst,a.MGRNAME,
99           a.first_report, a.last_report, b.permno, a.shares,a.TYPECODE
100  from wip.Holdings_v1 as a,
101     (select distinct ncusip, permno from input.crsp_msenames
102      where not missing(ncusip)) as b
103      where a.cusip=b.ncusip;
NOTE: Table WIP.HOLDINGS_V2 created, with 62249812 rows and 10 columns.

104  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           1:28.93
      cpu time            1:14.67
      

105  
106  /* Step4. Adjust shares using CRSP adjustment factors aligned at vintage dates */
107  proc sql;
108  create table wip.Holdings as
109  select distinct a.rdate,a.fdate, a.mgrno, a.MGRNAME,a.NumInst, a.first_report,
109  a.last_report,
110        a.permno, a.shares*b.cfacshr as shares_adj label = "Adjusted Shares
110  Held",a.TYPECODE,b.p,b.me
111  from wip.Holdings_v2 as a, wip.crsp_m as b
112  where a.permno=b.permno and a.fdate = b.qdate;
NOTE: Table WIP.HOLDINGS created, with 62090371 rows and 12 columns.

113  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           1:41.23
      cpu time            1:23.82
      

114  
115  proc sort data=wip.Holdings nodupkey; by permno rdate mgrno; run;

NOTE: There were 62090371 observations read from the data set WIP.HOLDINGS.
NOTE: 138 observations with duplicate key values were deleted.
NOTE: The data set WIP.HOLDINGS has 62090233 observations and 12 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           33.95 seconds
      cpu time            39.68 seconds
      

116  
117  proc sort data=wip.crsp_m   nodupkey; by permno qdate;       run;

NOTE: There were 1112338 observations read from the data set WIP.CRSP_M.
NOTE: 0 observations with duplicate key values were deleted.
NOTE: The data set WIP.CRSP_M has 1112338 observations and 6 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.28 seconds
      cpu time            0.09 seconds
      

118  
119  /* Step 5: Calculate the percentage of a firms shares held by an institutional investor
119  */
120  data wip.holding1;
121  merge wip.Holdings(in=a) wip.crsp_m (in=b rename=(qdate=rdate));
122  by permno rdate;
123  if b and TSO>0;
124  ior_a = shares_adj/TSO;
125  MV=shares_adj*p/1000000;
126  if missing(ior_a) then ior_a=0;
127  IO_MISSING = (not a);
128  IO_G1      = (ior_a>1);
129  drop CFACSHR;
130  format  ior_a percentn8.2;
131  run;
 
NOTE: There were 62090233 observations read from the data set WIP.HOLDINGS.
NOTE: There were 1112338 observations read from the data set WIP.CRSP_M.
NOTE: The data set WIP.HOLDING1 has 62189261 observations and 17 variables.
NOTE: DATA statement used (Total process time):
      real time           32.29 seconds
      cpu time            12.62 seconds
      

132  
133  proc sort data=wip.holding1 nodupkey; by permno rdate mgrno; run;

NOTE: There were 62189261 observations read from the data set WIP.HOLDING1.
NOTE: 0 observations with duplicate key values were deleted.
NOTE: The data set WIP.HOLDING1 has 62189261 observations and 17 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           47.01 seconds
      cpu time            29.95 seconds
      

134  
135  /* Step 6: Calculate the percentage of an institutional investors equity portfolio
135  invested in a firm */
136  proc sort data=wip.holding1 ; by mgrno rdate; run;

NOTE: There were 62189261 observations read from the data set WIP.HOLDING1.
NOTE: The data set WIP.HOLDING1 has 62189261 observations and 17 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           51.15 seconds
      cpu time            43.95 seconds
      

137  
138  *calculate the total market value of the securities held in an institutional investors
138  equity portfolio;
139  proc means   data=wip.holding1 noprint ;
140  by mgrno rdate ;
141  var mv;
142  output out=wip.m  sum=  /autoname;
143  run;

NOTE: There were 62189261 observations read from the data set WIP.HOLDING1.
NOTE: The data set WIP.M has 255432 observations and 5 variables.
NOTE: PROCEDURE MEANS used (Total process time):
      real time           22.85 seconds
      cpu time            7.37 seconds
      

144  
145  proc sql;
146  create table wip.holding2 as
147  select distinct a.*,b.MV_Sum
148  from wip.holding1 as a, wip.m as b
149  where a.mgrno=b.mgrno and a.rdate = b.rdate;
NOTE: Table WIP.HOLDING2 created, with 62189261 rows and 18 columns.

150  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           2:02.96
      cpu time            1:29.31
      

151  
152  data wip.holding3;
153  set wip.holding2;
154  ior_b=MV/MV_SUM;
155  if missing(ior_b) then ior_b=0;
156  format  ior_b percentn8.2;
157  run;
 
NOTE: There were 62189261 observations read from the data set WIP.HOLDING2.
NOTE: The data set WIP.HOLDING3 has 62189261 observations and 19 variables.
NOTE: DATA statement used (Total process time):
      real time           38.78 seconds
      cpu time            8.87 seconds
      

158  
159  proc sort data=wip.holding3 nodupkey; by permno rdate mgrno; run;

NOTE: There were 62189261 observations read from the data set WIP.HOLDING3.
NOTE: 0 observations with duplicate key values were deleted.
NOTE: The data set WIP.HOLDING3 has 62189261 observations and 19 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           54.31 seconds
      cpu time            43.39 seconds
      

160  
161  *Keep relevant variables;
162  proc sql;
163   create table wip.holding_final as
164   select  RDATE,MGRNO, MGRNAME,PERMNO, TYPECODE, ior_a, ior_b
165  from wip.holding3;
NOTE: Table WIP.HOLDING_FINAL created, with 62189261 rows and 7 columns.

166  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           39.29 seconds
      cpu time            7.79 seconds
      

167  
168  *Incorporate firm identifiers (gvkey) into the ownership dataset;
169  *link_permno_gvkey is the linking table between permno and gvkey obtained from the
169  CRSP/Compustat Merged database;
170  proc sql;
171  create table wip.ownership as
172  select distinct a.*,b.gvkey, b.comp_conm
173  from wip.holding_final as a, input.link_permno_gvkey as b
174  where a.permno=b.permno
175  order by mgrno, rdate, gvkey, permno;
NOTE: Table WIP.OWNERSHIP created, with 61268437 rows and 9 columns.

176  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           1:41.81
      cpu time            1:25.56
      

177  
178  *Some firms have multiple share classes
179  *ior_a is calculated as the average across those classes and ior_b is computed by summing
179  across share classes
180  *Results are robust to alternative weighting schemes;
181  proc means   data=wip.ownership noprint ;
182  by  mgrno rdate gvkey ;
183  var ior_a;
184  output out=wip.ior_a_adj  mean=  /autoname;
185  run;

NOTE: There were 61268437 observations read from the data set WIP.OWNERSHIP.
NOTE: The data set WIP.IOR_A_ADJ has 61053543 observations and 6 variables.
NOTE: PROCEDURE MEANS used (Total process time):
      real time           1:41.09
      cpu time            1:39.60
      

186  
187  proc means   data=wip.ownership noprint ;
188  by  mgrno rdate gvkey ;
189  var ior_b;
190  output out=wip.ior_b_adj  sum=  /autoname;
191  run;

NOTE: There were 61268437 observations read from the data set WIP.OWNERSHIP.
NOTE: The data set WIP.IOR_B_ADJ has 61053543 observations and 6 variables.
NOTE: PROCEDURE MEANS used (Total process time):
      real time           1:39.47
      cpu time            1:37.53
      

192  
193  proc sort data=wip.ownership  nodupkey; by mgrno rdate gvkey  ; run;

NOTE: There were 61268437 observations read from the data set WIP.OWNERSHIP.
NOTE: 214894 observations with duplicate key values were deleted.
NOTE: The data set WIP.OWNERSHIP has 61053543 observations and 9 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           41.00 seconds
      cpu time            30.32 seconds
      

194  
195  data wip.ownership;
196  set wip.ownership;
197  drop ior_a ior_b permno;
198  run;

NOTE: There were 61053543 observations read from the data set WIP.OWNERSHIP.
NOTE: The data set WIP.OWNERSHIP has 61053543 observations and 6 variables.
NOTE: DATA statement used (Total process time):
      real time           31.83 seconds
      cpu time            7.26 seconds
      

199  
200  proc sql;
201  create table wip.ownership1 as
202  select distinct a.*,b.ior_a_Mean as ior_a
203  from wip.ownership as a, wip.ior_a_adj as b
204  where a.gvkey=b.gvkey and a.rdate=b.rdate and a.mgrno=b.mgrno
205  order by comp_conm, rdate, mgrno;
NOTE: Table WIP.OWNERSHIP1 created, with 61053543 rows and 7 columns.

206  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           3:07.32
      cpu time            2:44.81
      

207  
208  proc sql;
209  create table output.ownership_final as
210  select distinct a.*,b.ior_b_Sum as ior_b
211  from wip.ownership1 as a, wip.ior_b_adj as b
212  where a.gvkey=b.gvkey and a.rdate=b.rdate and a.mgrno=b.mgrno
213  order by comp_conm, rdate, mgrno;
NOTE: Table OUTPUT.OWNERSHIP_FINAL created, with 61053543 rows and 8 columns.

214  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           3:05.11
      cpu time            2:33.78
      

215  
216  *sas to stata data conversion;
217  proc export data=output.ownership_final outfile= "D:\stata data\input\ownership_final.dta"
217  replace;
218  run;

NOTE: The export data set has 61053543 observations and 8 variables.
NOTE: "D:\stata data\input\ownership_final.dta" file was successfully created.
NOTE: PROCEDURE EXPORT used (Total process time):
      real time           24.09 seconds
      cpu time            13.50 seconds


-------------------------------------------------------------------------------

. /*===========================================================================
> ========
>  This code is to identify sister firms, defined as large investee firms that 
> are 
>  owned by large shareholders of the ESG rating agency within a given year.
>  ============================================================================
> =======*/
. 
. global input "/Volumes/T9/stata data/input"

. global wip "/Volumes/T9/stata data/wip"

. global output "/Volumes/T9/stata data/output"

. 
. set more off

. set type double

. 
. *****************************************************************
. /* Step1: Identify large shareholders of the ESG rating agency */
. *****************************************************************
. 
. *To align the timing of ownership and ESG information, we lag the ownership d
> ata by six months
. *Results are robust without the lag
. use "$input/ownership_final.dta", clear

. gen effective_date= dofm(mofd(rdate) - 6) + day(rdate) - 2

. format effective_date %td

. drop rdate

. rename effective_date rdate

. save "$input/ownership_final1.dta", replace
file /Volumes/T9/stata data/input/ownership_final1.dta saved

. 
. *Step1.1 Identify the shareholders of the ESG rating agency
. use "$input/ownership_final1.dta", clear

. keep if gvkey=="178507"
(61,047,168 observations deleted)

. keep if rdate<=date("29dec2015","DMY") & rdate>=date("29mar2010","DMY")
(1,845 observations deleted)

. gen large_shareholder=1 if ior_a>0.05
(4,417 missing values generated)

. replace large_shareholder=0 if large_shareholder==.
(4,417 real changes made)

. *The results are robust to an alternative definition of large shareholders, w
> hereby an institutional investor is classified as a large shareholder if it h
> olds at least 4% of the rater's shares.
. *gen large_shareholder=1 if ior_a>0.04
. *replace large_shareholder=0 if large_shareholder==.
. save "$wip/rater_shareholder.dta", replace
(file /Volumes/T9/stata data/wip/rater_shareholder.dta not found)
file /Volumes/T9/stata data/wip/rater_shareholder.dta saved

. 
. *Step1.2 Keep only large shareholders of the ESG rating agency
. use "$wip/rater_shareholder.dta", clear

. keep if large_shareholder==1
(4,417 observations deleted)

. sort rdate mgrno

. save "$wip/rater_large_shareholder.dta", replace
(file /Volumes/T9/stata data/wip/rater_large_shareholder.dta not found)
file /Volumes/T9/stata data/wip/rater_large_shareholder.dta saved

. 
. *Step1.3 Create a list of large shareholders and construct relevant character
> istic variables
. use "$wip/rater_large_shareholder.dta", clear

. sort  mgrno rdate

. by mgrno : gen large_quarter=_N

. duplicates drop mgrno, force

Duplicates in terms of mgrno

(99 observations deleted)

. gsort - large_quarter

. drop rdate gvkey comp_conm ior_a ior_b large_shareholder 

. 
. *Classify blockholders as transient (short-term) or dedicated (long-term)
. gen transient=1 if large_quarter<=4
(7 missing values generated)

. replace transient=0 if transient==.
(7 real changes made)

. gen dedicated=1 if large_quarter>4
(7 missing values generated)

. replace dedicated=0 if dedicated==.
(7 real changes made)

. gsort -large_quarter +mgrname

. save "$wip/rater_blockholder_list.dta", replace
(file /Volumes/T9/stata data/wip/rater_blockholder_list.dta not found)
file /Volumes/T9/stata data/wip/rater_blockholder_list.dta saved

. 
. *Identify whether an institutional investor is among the five largest or five
>  smallest blockholders of the ESG rater
. use  "$wip/rater_shareholder.dta", clear 

. merge m:1 mgrno using  "$wip/rater_blockholder_list.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                         4,265
        from master                     4,265  (_merge==1)
        from using                          0  (_merge==2)

    Matched                               265  (_merge==3)
    -----------------------------------------

. keep if _merge==3
(4,265 observations deleted)

. drop _merge

. sort large_quarter mgrno mgrname rdate

. by large_quarter mgrno mgrname: egen large_scale=mean(ior_a)

. duplicates drop mgrno, force

Duplicates in terms of mgrno

(251 observations deleted)

. keep mgrno large_scale 

. merge 1:1 mgrno using "$wip/rater_blockholder_list.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                             0
    Matched                                14  (_merge==3)
    -----------------------------------------

. drop _merge

. sort large_scale

. gen largest_blockholders=1 if large_scale>=0.057
(9 missing values generated)

. replace largest_blockholders=0 if largest_blockholders==.
(9 real changes made)

. gen smallest_blockholders=1 if large_scale<=0.0407
(9 missing values generated)

. replace smallest_blockholders=0 if smallest_blockholders==.
(9 real changes made)

. save "$output/rater_blockholder_list.dta", replace
(file /Volumes/T9/stata data/output/rater_blockholder_list.dta not found)
file /Volumes/T9/stata data/output/rater_blockholder_list.dta saved

. 
. *****************************************************************************
> ***********
. /* Step2: Identify large investee firms held by the ESG rater's large shareho
> lders    */
. *****************************************************************************
> ***********
. 
. *Step 2.1 Identify investee firms held by the ESG rater's large shareholders
. use "$wip/rater_large_shareholder.dta", clear

. drop gvkey comp_conm ior_a ior_b large_shareholder 

. merge 1:m mgrno rdate using "$input/ownership_final1.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                    60,858,641
        from master                         0  (_merge==1)
        from using                 60,858,641  (_merge==2)

    Matched                           194,902  (_merge==3)
    -----------------------------------------

. keep if _merge==3
(60,858,641 observations deleted)

. drop _merge

. 
. *Define large investees as investees making up 0.25% of a large shareholder'
> s portfolio, following Kedia et al. (2017)
. *0.25% is the 75th percentile in 13F universe
. *Results are robust to 90th percentile cutoff (i.e., 1.1%)
. gen large75=1 if ior_b>=0.0025
(187,430 missing values generated)

. replace large75=0 if large75==.
(187,430 real changes made)

. *The results are robust to a stricter definition of Sister, which equals one 
> if an investee firm accounts for at least 5% of the portfolio of the rater's 
> large shareholder in a given year, and zero otherwise.
. *gen large75=1 if ior_b>=0.05
. *replace large75=0 if large75==.
. sort mgrno gvkey  rdate

. gen year=year(rdate)

. sort mgrno gvkey  year rdate

. by mgrno gvkey year: gen each_quarter=_N

. by mgrno gvkey year: egen min_large75=min(large75)

. 
. *Require large shareholders to hold rater's shares for 2 quarters in a year,
>  results robust to 3- or 4-quarter cutoffs
. gen large_investee1=1 if each_quarter>=2&min_large75==1
(189,026 missing values generated)

. replace large_investee1=0 if large_investee1==.
(189,026 real changes made)

. drop large75 each_quarter min_large75 

. save "$wip/rater_investee.dta", replace
(file /Volumes/T9/stata data/wip/rater_investee.dta not found)
file /Volumes/T9/stata data/wip/rater_investee.dta saved

. 
. *Step 2.2 Keep only large investee firms held by the ESG rater's large shareh
> olders;
. use "$wip/rater_investee.dta", clear

. keep if large_investee1==1
(189,026 observations deleted)

. merge m:1 mgrno using "$output/rater_blockholder_list.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                             0
    Matched                             5,876  (_merge==3)
    -----------------------------------------

. drop _merge

. order gvkey rdate  mgrno large_quarter

. sort gvkey rdate large_quarter

. by gvkey rdate: gen num_hold_temp=_N

. sort gvkey year rdate

. by gvkey year: egen num_hold=mean(num_hold_temp)

. drop num_hold_temp

. sort gvkey rdate large_quarter

. 
. *If multiple large shareholders hold a large investee on the same day, we ret
> ain the one 
. *who has held the rater as a large shareholder for the longest period (in qua
> rters).
. by gvkey rdate: keep if _N==_n
(1,580 observations deleted)

. save "$wip/large_investee_unique", replace
(file /Volumes/T9/stata data/wip/large_investee_unique.dta not found)
file /Volumes/T9/stata data/wip/large_investee_unique.dta saved

. 
. *Construct relevant characteristic variables such as size_of_stake, defined a
> s 
. *the average weight of an investee firm in the portfolios of the rater's larg
> e shareholders within a given year.
. use "$wip/large_investee_unique", clear

. sort  mgrno gvkey year rdate

. order  mgrno gvkey year rdate

. by mgrno gvkey year: egen size_of_stake=mean(ior_b)

. by mgrno gvkey: gen duration_of_stake=_N

. save "$wip/large_investee_unique1", replace
(file /Volumes/T9/stata data/wip/large_investee_unique1.dta not found)
file /Volumes/T9/stata data/wip/large_investee_unique1.dta saved

. 
. *Step 2.3 Convert data from quarterly to annual frequency by keeping the last
>  available observation for each large investee firm in a year
. *Results are robust if we keep the annual observation where the investee firm
>  is held by
. *blockholders with either the largest ownership in the rater or the longest h
> olding period of the rater.
. use  "$wip/large_investee_unique1", clear

. drop ior_a ior_b large_investee1  

. generate large_investee_prior=0

. sort gvkey year rdate

. order gvkey year rdate mgrno

. by gvkey year: keep if _n==_N
(3,074 observations deleted)

. save "$wip/large_investee_final", replace
(file /Volumes/T9/stata data/wip/large_investee_final.dta not found)
file /Volumes/T9/stata data/wip/large_investee_final.dta saved

. 
. *Step 2.4 Indentify pseudo sisters, defined as large investee firms held by t
> he same group of large shareholders prior to them being classified as large s
> hareholders of the rating agency
. use "$input/ownership_final1.dta", clear

. keep if rdate<=date("29dec2009","DMY") & rdate>=date("29mar2003","DMY")
(43,413,261 observations deleted)

. merge m:1 mgrno using "$output/rater_blockholder_list.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                    17,097,853
        from master                17,097,853  (_merge==1)
        from using                          0  (_merge==2)

    Matched                           542,429  (_merge==3)
    -----------------------------------------

. keep if _merge==3
(17,097,853 observations deleted)

. drop _merge

. gen large75=1 if ior_b>=0.0025
(514,913 missing values generated)

. replace large75=0 if large75==.
(514,913 real changes made)

. *The results are robust to a stricter definition of pseudo sister, which equa
> ls one if an investee firm accounts for at least 5% of the portfolio of these
> s large shareholders in a given year, and zero otherwise.
. *gen large75=1 if ior_b>=0.05
. *replace large75=0 if large75==.
. gen year=year(rdate)

. sort mgrno gvkey  year rdate

. by mgrno gvkey year: gen each_quarter=_N

. by mgrno gvkey year: egen min_large75=min(large75)

. gen large_investee1=1 if each_quarter>=2&min_large75==1
(522,910 missing values generated)

. replace large_investee1=0 if large_investee1==.
(522,910 real changes made)

. drop large75  each_quarter min_large75 

. keep if large_investee1==1
(522,910 observations deleted)

. sort gvkey rdate large_quarter

. by gvkey rdate: keep if _N==_n
(8,933 observations deleted)

. generate large_investee_prior=1

. drop ior_a ior_b large_investee1 

. sort gvkey year rdate

. by gvkey year: keep if _n==_N
(7,797 observations deleted)

. save "$wip/large_investee_unique_before.dta", replace
(file /Volumes/T9/stata data/wip/large_investee_unique_before.dta not found)
file /Volumes/T9/stata data/wip/large_investee_unique_before.dta saved

. 
. *Step 2.5 Append the sample of sister firms and pseudo-sister firms into a co
> mbined dataset.
. use "$wip/large_investee_final",  clear

. generate large_investee=1

. append using  "$wip/large_investee_unique_before.dta"

. replace large_investee=0 if large_investee==.
(2,789 real changes made)

. save "$output/all_large_investee.dta",  replace
(file /Volumes/T9/stata data/output/all_large_investee.dta not found)
file /Volumes/T9/stata data/output/all_large_investee.dta saved

-------------------------------------------------------------------------------
      
2    ***********************************************************************
3      This code cleans the ESG ratings from the focal ESG rating provider
4    ***********************************************************************;
5    
6    libname input 'D:\sas data\input';
NOTE: Libref INPUT was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\input
7    libname wip 'D:\sas data\wip';
NOTE: Libref WIP was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\wip
8    libname output 'D:\sas data\output';
NOTE: Libref OUTPUT was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\output
9    
10   *A company's score in each of the seven broad categoriesenvironment, community, human
10   rights, employee relations,
11   diversity, product, and governanceis calculated as the difference between the number of
11   strength indicators and
12   the number of concern indicators;
13   data wip.kld_esg_iden1;
14   set input.kld_esg_iden;
15   ENV_TOTAL=ENV_str_TOTAL-ENV_con_TOTAL;
16   COM_TOTAL=COM_str_TOTAL-COM_con_TOTAL;
17   HUM_TOTAL=HUM_str_TOTAL-HUM_con_TOTAL;
18   EMP_TOTAL=EMP_str_TOTAL-EMP_con_TOTAL;
19   DIV_TOTAL=DIV_str_TOTAL-DIV_con_TOTAL;
20   PRO_TOTAL=PRO_str_TOTAL-PRO_con_TOTAL;
21   CGOV_TOTAL=CGOV_str_TOTAL-CGOV_con_TOTAL;
22   run;
  
NOTE: There were 45271 observations read from the data set INPUT.KLD_ESG_IDEN.
NOTE: The data set WIP.KLD_ESG_IDEN1 has 45271 observations and 31 variables.
NOTE: DATA statement used (Total process time):
      real time           0.08 seconds
      cpu time            0.03 seconds
      

23   
24   *Assign 0 to missing values;
25   data wip.kld_esg_iden2;
26   set wip.kld_esg_iden1;
27      array change _numeric_;
28               do over change;
29               if change=. then change=0;
30               end;
31      run ;

NOTE: There were 45271 observations read from the data set WIP.KLD_ESG_IDEN1.
NOTE: The data set WIP.KLD_ESG_IDEN2 has 45271 observations and 31 variables.
NOTE: DATA statement used (Total process time):
      real time           0.06 seconds
      cpu time            0.04 seconds
      

32   
33   *Calculate the overall ESG rating by aggregating scores across the individual categories.
34   In addition, compute other aggregated ratings such as ESG Strengths, ESG Concerns, Social
34   pillar ratings, and etc;
35   data wip.kld_esg_iden3;
36   set wip.kld_esg_iden2;
37   ESG_TOTAL=ENV_TOTAL+COM_TOTAL+EMP_TOTAL+DIV_TOTAL+PRO_TOTAL+HUM_TOTAL+CGOV_TOTAL;
38   ESG_str_TOTAL=ENV_str_TOTAL+COM_str_TOTAL+EMP_str_TOTAL+DIV_str_TOTAL+PRO_str_TOTAL+HUM_st
38   r_TOTAL+CGOV_str_TOTAL;
39   ESG_con_TOTAL=ENV_con_TOTAL+COM_con_TOTAL+EMP_con_TOTAL+DIV_con_TOTAL+PRO_con_TOTAL+HUM_co
39   n_TOTAL+CGOV_con_TOTAL;
40   SOCIAL_TOTAL=COM_TOTAL+EMP_TOTAL+DIV_TOTAL+PRO_TOTAL+HUM_TOTAL;
41   SOCIAL_con_TOTAL=COM_con_TOTAL+EMP_con_TOTAL+DIV_con_TOTAL+PRO_con_TOTAL+HUM_con_TOTAL;
42   SOCIAL_str_TOTAL=COM_str_TOTAL+EMP_str_TOTAL+DIV_str_TOTAL+PRO_str_TOTAL+HUM_str_TOTAL;
43   run;

NOTE: There were 45271 observations read from the data set WIP.KLD_ESG_IDEN2.
NOTE: The data set WIP.KLD_ESG_IDEN3 has 45271 observations and 37 variables.
NOTE: DATA statement used (Total process time):
      real time           0.05 seconds
      cpu time            0.04 seconds
      

44   
45   proc sql;
45             create table wip.kld_esg_iden4 as
46     select a.ESG_TOTAL, a.social_total, a.ESG_str_TOTAL, a.social_str_total,a.ESG_con_TOTAL,
46     a.social_con_total,b.*
47     from  wip.kld_esg_iden3 a, wip.kld_esg_iden1 b
48     where a.year=b.year and  a.permno=b.permno
49     order by  year, permno;
NOTE: Table WIP.KLD_ESG_IDEN4 created, with 45271 rows and 37 columns.

50   quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.09 seconds
      cpu time            0.04 seconds
      

51   
52   *Rename variables;
53   proc sql;
54   create table wip.esg_score as
55   select
56   YEAR as KLD_YEAR, CUSIP as KLD_CUSIP, ESG_TOTAL as KLD_ESG, ENV_TOTAL as KLD_ENV,
56   SOCIAL_TOTAL as KLD_SOCIAL, CGOV_TOTAL as KLD_GOV, ESG_con_TOTAL as KLD_ESG_CON,
56   ESG_str_TOTAL as KLD_ESG_STR,  SOCIAL_con_TOTAL as KLD_SOCIAL_CON, SOCIAL_str_TOTAL as
56   KLD_SOCIAL_STR,ENV_str_TOTAL as KLD_ENV_STR, ENV_con_TOTAL as KLD_ENV_CON, CGOV_str_TOTAL
56   as KLD_GOV_STR,CGOV_con_TOTAL as KLD_GOV_CON,
57   COM_TOTAL as KLD_COM, COM_str_TOTAL as KLD_COM_STR, COM_con_TOTAL as KLD_COM_CON,
57   HUM_TOTAL as KLD_HUM, HUM_str_TOTAL as KLD_HUM_STR, HUM_con_TOTAL as KLD_HUM_CON,
57   EMP_TOTAL as KLD_EMP, EMP_str_TOTAL as KLD_EMP_STR, EMP_con_TOTAL as KLD_EMP_CON,
57   DIV_TOTAL as KLD_DIV, DIV_str_TOTAL as KLD_DIV_STR, DIV_con_TOTAL as KLD_DIV_CON,
57   PRO_TOTAL as KLD_PRO, PRO_str_TOTAL as KLD_PRO_STR, PRO_con_TOTAL as KLD_PRO_CON
58   from   wip.kld_esg_iden4;
NOTE: Table WIP.ESG_SCORE created, with 45271 rows and 29 columns.

59   quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.03 seconds
      cpu time            0.00 seconds
      

60   
61   *Incorporate firm identifiers (gvkey) into the KLD ESG dataset;
62   *cc_id contains the linking table between cusip and gvkey obtained from the CRSP/Compustat
62    Merged database;
63   data wip.cc_id_;
64   set input.cc_id;
65   comp_cusip8=substr(comp_cusip,1,8);
66   drop comp_cusip;
67   rename comp_cusip8=comp_cusip;
68   run;

NOTE: There were 326938 observations read from the data set INPUT.CC_ID.
NOTE: The data set WIP.CC_ID_ has 326938 observations and 12 variables.
NOTE: DATA statement used (Total process time):
      real time           0.28 seconds
      cpu time            0.03 seconds
      

69   
70   proc sort data=wip.cc_id_ nodupkey; by crsp_cusip; run;

NOTE: There were 326938 observations read from the data set WIP.CC_ID_.
NOTE: 300130 observations with duplicate key values were deleted.
NOTE: The data set WIP.CC_ID_ has 26808 observations and 12 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.06 seconds
      cpu time            0.10 seconds
      

71   
72   proc sql;
72             create table wip.KLD_ESG_ALL as
73    select a.*,b.*
74    from wip.cc_id_ as a, wip.esg_score as b
75    where a.crsp_cusip=b.kld_cusip
76    order by a.gvkey, a.crsp_year;
NOTE: Table WIP.KLD_ESG_ALL created, with 44074 rows and 41 columns.

77   quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.06 seconds
      cpu time            0.03 seconds
      

78   
79   *Keep relevant variables;
80   proc sql;
81    create table wip.KLD_ESG_FINAL as
82    select
83   KLD_year,GVKEY, comp_conm, KLD_ESG,KLD_ENV, KLD_SOCIAL as KLD_SOC, KLD_GOV,
83   KLD_ESG_STR,KLD_ESG_CON,KLD_ENV_STR,KLD_ENV_CON,KLD_SOCIAL_STR as KLD_SOC_STR,
83   KLD_SOCIAL_CON AS KLD_SOC_CON, KLD_GOV_STR,KLD_GOV_CON,
84   KLD_COM, KLD_COM_STR, KLD_COM_CON, KLD_HUM, KLD_HUM_STR, KLD_HUM_CON, KLD_EMP,
84   KLD_EMP_STR, KLD_EMP_CON, KLD_DIV, KLD_DIV_STR, KLD_DIV_CON, KLD_PRO, KLD_PRO_STR,
84   KLD_PRO_CON
85   from wip.KLD_ESG_ALL;
NOTE: Table WIP.KLD_ESG_FINAL created, with 44074 rows and 30 columns.

86   quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.04 seconds
      cpu time            0.01 seconds
      

87   
88   proc sort data=wip.KLD_ESG_FINAL nodupkey out=output.KLD_ESG_FINAL; by gvkey kld_year; run
88   ;

NOTE: There were 44074 observations read from the data set WIP.KLD_ESG_FINAL.
NOTE: 131 observations with duplicate key values were deleted.
NOTE: The data set OUTPUT.KLD_ESG_FINAL has 43943 observations and 30 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.05 seconds
      cpu time            0.03 seconds
      

89   
90   *sas to stata data conversion;
91   proc export data=output.KLD_ESG_FINAL outfile= "D:\stata data\input\KLD_ESG_FINAL.dta"
91   replace;
92   run;

NOTE: The export data set has 43943 observations and 30 variables.
NOTE: "D:\stata data\input\KLD_ESG_FINAL.dta" file was successfully created.
NOTE: PROCEDURE EXPORT used (Total process time):
      real time           0.06 seconds
      cpu time            0.03 seconds
      

NOTE: SAS Institute Inc., SAS Campus Drive, Cary, NC USA 27513-2414
NOTE: The SAS System used:
      real time           1:22.74
      cpu time            3.67 seconds
      
-------------------------------------------------------------------------------
. 
. /*===========================================================================
> ==
>       Perform additional processing on ESG ratings from the focal provider
>  ============================================================================
> =*/
. 
. 
. global input "/Volumes/T9/stata data/input"

. global wip "/Volumes/T9/stata data/wip"

. global output "/Volumes/T9/stata data/output"

. 
. set more off

. set type double

. 
. *****************************************************************************
> *********************
. /* In addition to the main KLD ESG rating measure, we also construct a 0100 
> scaled version. 
> To do so, we first normalize the ratings to a 01 range within each year, the
> n multiply by 100. */
. *****************************************************************************
> *********************
. 
. use "$input/kld_esg_final.dta", clear

. rename kld_year esg_year

. sort esg_year

. *Normalize the kld esg between 0 and 1
. by esg_year: egen kld_esg_max = max(kld_esg) 

. by esg_year: egen kld_esg_min = min(kld_esg) 

. gen kld_esg_pct = (kld_esg - kld_esg_min) / (kld_esg_max-kld_esg_min)

. gen kld_esg_pct_= kld_esg_pct*100

. save "$output/kld_esg.dta", replace
file /Volumes/T9/stata data/output/kld_esg.dta saved

-------------------------------------------------------------------------------

2    
3    ***********************************************************************
4      This code cleans the raw ESG ratings from other ESG rating providers
5    ***********************************************************************;
6    
7    libname input 'D:\sas data\input';
NOTE: Libref INPUT was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\input
8    libname wip 'D:\sas data\wip';
NOTE: Libref WIP was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\wip
9    libname output 'D:\sas data\output';
NOTE: Libref OUTPUT was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\output
10   
11   /* Step1. Process Bloomberg ESG data */
12   *Import raw Excel files containing Bloomberg ESG data;
13   proc import datafile="D:\Bloomberg_ESG\2004" out=wip.b2004
14         dbms=XLSX replace;
15         sheet="Worksheet";
16   run;

NOTE:    Variable Name Change.  Gov Disc Sc:CY -> Gov_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Soc Disc Sc:CY -> Soc_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Env Disc Sc:CY -> Env_Disc_Sc_CY                  
NOTE:    Variable Name Change.  ESG Disc Score:CY -> ESG_Disc_Score_CY               
NOTE: The import data set has 3022 observations and 8 variables.
NOTE: WIP.B2004 data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           0.21 seconds
      cpu time            0.17 seconds
      

17   
18   proc import datafile="D:\Bloomberg_ESG\2005" out=wip.b2005
19         dbms=XLSX replace;
20         sheet="Worksheet";
21   run;

NOTE:    Variable Name Change.  Gov Disc Sc:CY -> Gov_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Soc Disc Sc:CY -> Soc_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Env Disc Sc:CY -> Env_Disc_Sc_CY                  
NOTE:    Variable Name Change.  ESG Disc Score:CY -> ESG_Disc_Score_CY               
NOTE: The import data set has 3008 observations and 8 variables.
NOTE: WIP.B2005 data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           0.21 seconds
      cpu time            0.18 seconds
      

22   
23   proc import datafile="D:\Bloomberg_ESG\2006" out=wip.b2006
24         dbms=XLSX replace;
25         sheet="Worksheet";
26   run;

NOTE:    Variable Name Change.  Gov Disc Sc:CY -> Gov_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Soc Disc Sc:CY -> Soc_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Env Disc Sc:CY -> Env_Disc_Sc_CY                  
NOTE:    Variable Name Change.  ESG Disc Score:CY -> ESG_Disc_Score_CY               
NOTE: The import data set has 2979 observations and 8 variables.
NOTE: WIP.B2006 data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           0.20 seconds
      cpu time            0.18 seconds
      

27   
28   proc import datafile="D:\Bloomberg_ESG\2007" out=wip.b2007
29         dbms=XLSX replace;
30         sheet="Worksheet";
31   run;

NOTE:    Variable Name Change.  Gov Disc Sc:CY -> Gov_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Soc Disc Sc:CY -> Soc_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Env Disc Sc:CY -> Env_Disc_Sc_CY                  
NOTE:    Variable Name Change.  ESG Disc Score:CY -> ESG_Disc_Score_CY               
NOTE: The import data set has 2931 observations and 8 variables.
NOTE: WIP.B2007 data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           0.19 seconds
      cpu time            0.17 seconds
      

32   
33   proc import datafile="D:\Bloomberg_ESG\2008" out=wip.b2008
34         dbms=XLSX replace;
35         sheet="Worksheet";
36   run;

NOTE:    Variable Name Change.  Gov Disc Sc:CY -> Gov_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Soc Disc Sc:CY -> Soc_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Env Disc Sc:CY -> Env_Disc_Sc_CY                  
NOTE:    Variable Name Change.  ESG Disc Score:CY -> ESG_Disc_Score_CY               
NOTE: The import data set has 2924 observations and 8 variables.
NOTE: WIP.B2008 data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           0.21 seconds
      cpu time            0.20 seconds
      

37   
38   proc import datafile="D:\Bloomberg_ESG\2009" out=wip.b2009
39         dbms=XLSX replace;
40         sheet="Worksheet";
41   run;

NOTE:    Variable Name Change.  Gov Disc Sc:CY -> Gov_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Soc Disc Sc:CY -> Soc_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Env Disc Sc:CY -> Env_Disc_Sc_CY                  
NOTE:    Variable Name Change.  ESG Disc Score:CY -> ESG_Disc_Score_CY               
NOTE: The import data set has 2972 observations and 8 variables.
NOTE: WIP.B2009 data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           0.21 seconds
      cpu time            0.20 seconds
      

42   
43   proc import datafile="D:\Bloomberg_ESG\2010" out=wip.b2010
44         dbms=XLSX replace;
45         sheet="Worksheet";
46   run;

NOTE:    Variable Name Change.  Gov Disc Sc:CY -> Gov_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Soc Disc Sc:CY -> Soc_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Env Disc Sc:CY -> Env_Disc_Sc_CY                  
NOTE:    Variable Name Change.  ESG Disc Score:CY -> ESG_Disc_Score_CY               
NOTE: The import data set has 2952 observations and 8 variables.
NOTE: WIP.B2010 data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           0.20 seconds
      cpu time            0.20 seconds
      

47   
48   proc import datafile="D:\Bloomberg_ESG\2011" out=wip.b2011
49         dbms=XLSX replace;
50         sheet="Worksheet";
51   run;

NOTE:    Variable Name Change.  Gov Disc Sc:CY -> Gov_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Soc Disc Sc:CY -> Soc_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Env Disc Sc:CY -> Env_Disc_Sc_CY                  
NOTE:    Variable Name Change.  ESG Disc Score:CY -> ESG_Disc_Score_CY               
NOTE: The import data set has 2945 observations and 8 variables.
NOTE: WIP.B2011 data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           0.21 seconds
      cpu time            0.21 seconds
      

52   
53   proc import datafile="D:\Bloomberg_ESG\2012" out=wip.b2012
54         dbms=XLSX replace;
55         sheet="Worksheet";
56   run;

NOTE:    Variable Name Change.  Gov Disc Sc:CY -> Gov_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Soc Disc Sc:CY -> Soc_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Env Disc Sc:CY -> Env_Disc_Sc_CY                  
NOTE:    Variable Name Change.  ESG Disc Score:CY -> ESG_Disc_Score_CY               
NOTE: The import data set has 2969 observations and 8 variables.
NOTE: WIP.B2012 data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           0.21 seconds
      cpu time            0.18 seconds
      

57   
58   proc import datafile="D:\Bloomberg_ESG\2013" out=wip.b2013
59         dbms=XLSX replace;
60         sheet="Worksheet";
61   run;

NOTE:    Variable Name Change.  Gov Disc Sc:CY -> Gov_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Soc Disc Sc:CY -> Soc_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Env Disc Sc:CY -> Env_Disc_Sc_CY                  
NOTE:    Variable Name Change.  ESG Disc Score:CY -> ESG_Disc_Score_CY               
NOTE: The import data set has 3019 observations and 8 variables.
NOTE: WIP.B2013 data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           0.22 seconds
      cpu time            0.20 seconds
      

62   
63   proc import datafile="D:\Bloomberg_ESG\2014" out=wip.b2014
64         dbms=XLSX replace;
65         sheet="Worksheet";
66   run;

NOTE:    Variable Name Change.  Gov Disc Sc:CY -> Gov_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Soc Disc Sc:CY -> Soc_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Env Disc Sc:CY -> Env_Disc_Sc_CY                  
NOTE:    Variable Name Change.  ESG Disc Score:CY -> ESG_Disc_Score_CY               
NOTE: The import data set has 3054 observations and 8 variables.
NOTE: WIP.B2014 data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           0.23 seconds
      cpu time            0.21 seconds
      

67   
68   proc import datafile="D:\Bloomberg_ESG\2015" out=wip.b2015
69         dbms=XLSX replace;
70         sheet="Worksheet";
71   run;

NOTE:    Variable Name Change.  Gov Disc Sc:CY -> Gov_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Soc Disc Sc:CY -> Soc_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Env Disc Sc:CY -> Env_Disc_Sc_CY                  
NOTE:    Variable Name Change.  ESG Disc Score:CY -> ESG_Disc_Score_CY               
NOTE: The import data set has 3019 observations and 8 variables.
NOTE: WIP.B2015 data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           0.21 seconds
      cpu time            0.18 seconds
      

72   
73   proc import datafile="D:\Bloomberg_ESG\2016" out=wip.b2016
74         dbms=XLSX replace;
75         sheet="Worksheet";
76   run;

NOTE:    Variable Name Change.  Gov Disc Sc:CY -> Gov_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Soc Disc Sc:CY -> Soc_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Env Disc Sc:CY -> Env_Disc_Sc_CY                  
NOTE:    Variable Name Change.  ESG Disc Score:CY -> ESG_Disc_Score_CY               
NOTE: The import data set has 2977 observations and 8 variables.
NOTE: WIP.B2016 data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           0.21 seconds
      cpu time            0.20 seconds
      

77   
78   proc import datafile="D:\Bloomberg_ESG\2017" out=wip.b2017
79         dbms=XLSX replace;
80         sheet="Worksheet";
81   run;

NOTE:    Variable Name Change.  Gov Disc Sc:CY -> Gov_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Soc Disc Sc:CY -> Soc_Disc_Sc_CY                  
NOTE:    Variable Name Change.  Env Disc Sc:CY -> Env_Disc_Sc_CY                  
NOTE:    Variable Name Change.  ESG Disc Score:CY -> ESG_Disc_Score_CY               
NOTE: The import data set has 2960 observations and 8 variables.
NOTE: WIP.B2017 data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           0.22 seconds
      cpu time            0.18 seconds
      

82   
83   *Generate year variables;
84   data wip.B2004;
85   set wip.B2004;
86   year=2004;
87   run;

NOTE: There were 3022 observations read from the data set WIP.B2004.
NOTE: The data set WIP.B2004 has 3022 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      cpu time            0.00 seconds
      

88   
89   data wip.B2005;
90   set wip.B2005;
91   year=2005;
92   run;

NOTE: There were 3008 observations read from the data set WIP.B2005.
NOTE: The data set WIP.B2005 has 3008 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      cpu time            0.01 seconds
      

93   
94   data wip.B2006;
95   set wip.B2006;
96   year=2006;
97   run;

NOTE: There were 2979 observations read from the data set WIP.B2006.
NOTE: The data set WIP.B2006 has 2979 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

98   
99   data wip.B2007;
100  set wip.B2007;
101  year=2007;
102  run;

NOTE: There were 2931 observations read from the data set WIP.B2007.
NOTE: The data set WIP.B2007 has 2931 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.02 seconds
      cpu time            0.01 seconds
      

103  
104  data wip.B2008;
105  set wip.B2008;
106  year=2008;
107  run;

NOTE: There were 2924 observations read from the data set WIP.B2008.
NOTE: The data set WIP.B2008 has 2924 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      cpu time            0.00 seconds
      

108  
109  data wip.B2009;
110  set wip.B2009;
111  year=2009;
112  run;

NOTE: There were 2972 observations read from the data set WIP.B2009.
NOTE: The data set WIP.B2009 has 2972 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      cpu time            0.01 seconds
      

113  
114  data wip.B2010;
115  set wip.B2010;
116  year=2010;
117  run;

NOTE: There were 2952 observations read from the data set WIP.B2010.
NOTE: The data set WIP.B2010 has 2952 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.03 seconds
      

118  
119  data wip.B2011;
120  set wip.B2011;
121  year=2011;
122  run;

NOTE: There were 2945 observations read from the data set WIP.B2011.
NOTE: The data set WIP.B2011 has 2945 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.02 seconds
      cpu time            0.00 seconds
      

123  
124  data wip.B2012;
125  set wip.B2012;
126  year=2012;
127  run;

NOTE: There were 2969 observations read from the data set WIP.B2012.
NOTE: The data set WIP.B2012 has 2969 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      cpu time            0.00 seconds
      

128  
129  data wip.B2013;
130  set wip.B2013;
131  year=2013;
132  run;

NOTE: There were 3019 observations read from the data set WIP.B2013.
NOTE: The data set WIP.B2013 has 3019 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      cpu time            0.00 seconds
      

133  
134  data wip.B2014;
135  set wip.B2014;
136  year=2014;
137  run;

NOTE: There were 3054 observations read from the data set WIP.B2014.
NOTE: The data set WIP.B2014 has 3054 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.03 seconds
      

138  
139  data wip.B2015;
140  set wip.B2015;
141  year=2015;
142  run;

NOTE: There were 3019 observations read from the data set WIP.B2015.
NOTE: The data set WIP.B2015 has 3019 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.02 seconds
      cpu time            0.00 seconds
      

143  
144  data wip.B2016;
145  set wip.B2016;
146  year=2016;
147  run;

NOTE: There were 2977 observations read from the data set WIP.B2016.
NOTE: The data set WIP.B2016 has 2977 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      cpu time            0.00 seconds
      

148  
149  data wip.B2017;
150  set wip.B2017;
151  year=2017;
152  run;

NOTE: There were 2960 observations read from the data set WIP.B2017.
NOTE: The data set WIP.B2017 has 2960 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      cpu time            0.01 seconds
      

153  
154  data wip.Bloomberg_ESG;
155  set wip.B2004 wip.B2005 wip.B2006 wip.B2007 wip.B2008 wip.B2009 wip.B2010 wip.B2011
155  wip.B2012 wip.B2013 wip.B2014 wip.B2015 wip.B2016 wip.B2017;
156  run;

NOTE: There were 3022 observations read from the data set WIP.B2004.
NOTE: There were 3008 observations read from the data set WIP.B2005.
NOTE: There were 2979 observations read from the data set WIP.B2006.
NOTE: There were 2931 observations read from the data set WIP.B2007.
NOTE: There were 2924 observations read from the data set WIP.B2008.
NOTE: There were 2972 observations read from the data set WIP.B2009.
NOTE: There were 2952 observations read from the data set WIP.B2010.
NOTE: There were 2945 observations read from the data set WIP.B2011.
NOTE: There were 2969 observations read from the data set WIP.B2012.
NOTE: There were 3019 observations read from the data set WIP.B2013.
NOTE: There were 3054 observations read from the data set WIP.B2014.
NOTE: There were 3019 observations read from the data set WIP.B2015.
NOTE: There were 2977 observations read from the data set WIP.B2016.
NOTE: There were 2960 observations read from the data set WIP.B2017.
NOTE: The data set WIP.BLOOMBERG_ESG has 41731 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

157  
158  *Rename variables;
159  data wip.Bloomberg_ESG;
160  set wip.Bloomberg_ESG;
161  rename Ticker=Bloomberg_Ticker;
162  rename Name=Bloomberg_Name;
163  rename CUSIP=Bloomberg_Cusip;
164  rename ISIN=Bloomberg_ISIN;
165  rename Gov_Disc_Sc_CY=Bloomberg_GOV;
166  rename Soc_Disc_Sc_CY=Bloomberg_SOC;
167  rename Env_Disc_Sc_CY=Bloomberg_ENV;
168  rename ESG_Disc_Score_CY=Bloomberg_ESG;
169  rename year=Bloomberg_year;
170  run;

NOTE: There were 41731 observations read from the data set WIP.BLOOMBERG_ESG.
NOTE: The data set WIP.BLOOMBERG_ESG has 41731 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.02 seconds
      cpu time            0.00 seconds
      

171  
172  *Label variables;
173  data wip.Bloomberg_ESG;
174  set wip.Bloomberg_ESG;
175  label Bloomberg_Ticker="Bloomberg_Ticker";
176  label Bloomberg_Name="Bloomberg_Name";
177  label Bloomberg_Cusip="Bloomberg_Cusip";
178  label Bloomberg_ISIN="Bloomberg_ISN";
179  label Bloomberg_GOV="Bloomberg_GOV";
180  label Bloomberg_SOC="Bloomberg_SOC";
181  label Bloomberg_ENV="Bloomberg_ENV";
182  label Bloomberg_ESG="Bloomberg_ESG";
183  run;

NOTE: There were 41731 observations read from the data set WIP.BLOOMBERG_ESG.
NOTE: The data set WIP.BLOOMBERG_ESG has 41731 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.00 seconds
      

184  
185  *Convert character variables to numeric format;
186  data wip.Bloomberg_ESG;
187  set wip.Bloomberg_ESG;
188  nBloomberg_GOV=Bloomberg_GOV+0;
189  nBloomberg_SOC=Bloomberg_SOC+0;
190  nBloomberg_ENV=Bloomberg_ENV+0;
191  nBloomberg_ESG=Bloomberg_ESG+0;
192  drop Bloomberg_GOV Bloomberg_SOC Bloomberg_ENV Bloomberg_ESG ;
193  rename nBloomberg_GOV=Bloomberg_GOV;
194  rename nBloomberg_SOC=Bloomberg_SOC;
195  rename nBloomberg_ENV=Bloomberg_ENV;
196  rename nBloomberg_ESG=Bloomberg_ESG;
197  run;

NOTE: There were 41731 observations read from the data set WIP.BLOOMBERG_ESG.
NOTE: The data set WIP.BLOOMBERG_ESG has 41731 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.03 seconds
      

198  
199  *We first merge the Bloomberg ESG data with Capital IQ company identifiers (COMPANYID)
199  using ISIN as the matching key;
200  *Isin_capitaliqid is the Capital IQ linking table that connects ISIN to COMPANYID;
201  proc sql;
201           create table wip.Bloomberg_ESG_ID1 as
202    select a.*, b.*
203    from wip.Bloomberg_ESG a, input.Isin_capitaliqid b
204    where a.Bloomberg_ISIN=b.ISIN
205    order by b.COMPANYID, a.Bloomberg_year;
NOTE: Table WIP.BLOOMBERG_ESG_ID1 created, with 42031 rows and 14 columns.

206  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           13.04 seconds
      cpu time            2.12 seconds
      

207  
208  *We then merge the Bloomberg ESG data with gvkey using COMPANYID as the matching key;
209  *Capital_iq_linktable is the Capital IQ linking table that connects COMPANYID to gvkey;
210  proc sql;
210           create table wip.Bloomberg_ESG_ID2 as
211    select a.*, b.*
212    from wip.Bloomberg_ESG_ID1 a, input.Capital_iq_gvkey_ciqid_linktable b
213    where a.COMPANYID=b.COMPANYID
214    order by b.gvkey, a.Bloomberg_year;
NOTE: Table WIP.BLOOMBERG_ESG_ID2 created, with 42605 rows and 15 columns.

215  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.23 seconds
      cpu time            0.09 seconds
      

216  
217  *Keep relevant variables;
218  proc sql;
219   create table wip.Bloomberg_ESG_Final as
220   select
221  Bloomberg_year, COMPANYNAME, GVKEY, Bloomberg_ESG, Bloomberg_ENV, Bloomberg_SOC,
221  Bloomberg_GOV
222  from   wip.Bloomberg_ESG_ID2;
NOTE: Table WIP.BLOOMBERG_ESG_FINAL created, with 42605 rows and 7 columns.

223  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.05 seconds
      cpu time            0.03 seconds
      

224  
225  proc sort data=wip.Bloomberg_ESG_Final nodupkey; by gvkey Bloomberg_year; run;

NOTE: There were 42605 observations read from the data set WIP.BLOOMBERG_ESG_FINAL.
NOTE: 350 observations with duplicate key values were deleted.
NOTE: The data set WIP.BLOOMBERG_ESG_FINAL has 42255 observations and 7 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.06 seconds
      cpu time            0.01 seconds
      

226  
227  data output.Bloomberg_ESG_Final;
228  set wip.Bloomberg_ESG_Final;
229  if not missing (Bloomberg_ESG);
230  run;

NOTE: There were 42255 observations read from the data set WIP.BLOOMBERG_ESG_FINAL.
NOTE: The data set OUTPUT.BLOOMBERG_ESG_FINAL has 21901 observations and 7 variables.
NOTE: DATA statement used (Total process time):
      real time           0.04 seconds
      cpu time            0.00 seconds
      

231  
232  /* Step2. Process Refinitiv ESG data */
233  *Import raw Excel files containing Refinitiv ESG data;
234  proc import datafile="D:\Asset4_ESG\ASSET4_TR_ESG_NEW" out=wip.Refinitiv_TR_ESG_NEW
235        dbms=XLSX replace;
236        sheet="sheet1";
237  run;

NOTE:    Variable Name Change.  ASSET4 Company Name -> ASSET4_Company_Name             
NOTE:    Variable Name Change.  ASSET4 Primary Exchange -> ASSET4_Primary_Exchange         
NOTE:    Variable Name Change.  ASSET4 Status -> ASSET4_Status                   
NOTE:    Variable Name Change.  LOC OFF. CODE -> LOC_OFF__CODE                   
NOTE: The import data set has 40512 observations and 13 variables.
NOTE: WIP.REFINITIV_TR_ESG_NEW data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           3.63 seconds
      cpu time            3.54 seconds
      

238  
239  *Convert character variables to numeric format;
240  data wip.Refinitiv_TR_ESG1;
241  set wip.Refinitiv_TR_ESG_NEW;
242  if not missing (Asset4);
243  nTR_ESGC=TR_ESGC+0;
244  drop Asset4 TR_ESG TR_ESGC TR_CON Asset4_ENV Asset4_GOV Asset4_SOCIAL Asset4_ECON;
245  rename nTR_ESGC=Refinitiv_ESGC;
246  rename Year=Refinitiv_year;
247  run;

NOTE: There were 40512 observations read from the data set WIP.REFINITIV_TR_ESG_NEW.
NOTE: The data set WIP.REFINITIV_TR_ESG1 has 14550 observations and 6 variables.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      cpu time            0.00 seconds
      

248  
249  *Incorporate firm identifiers (gvkey) into the Refinitiv ESG dataset;
250  *cc_id contains the linking table between cusip and gvkey obtained from the CRSP/Compustat
250  Merged database;
251  data wip.cc_id1;
252  set input.cc_id;
253  comp_cusip8=substr(comp_cusip,1,8);
254  drop comp_cusip;
255  rename comp_cusip8=comp_cusip;
256  run;

NOTE: There were 326938 observations read from the data set INPUT.CC_ID.
NOTE: The data set WIP.CC_ID1 has 326938 observations and 12 variables.
NOTE: DATA statement used (Total process time):
      real time           0.12 seconds
      cpu time            0.07 seconds
      

257  
258  proc sort data=wip.cc_id1 nodupkey; by comp_cusip; run;

NOTE: There were 326938 observations read from the data set WIP.CC_ID1.
NOTE: 300655 observations with duplicate key values were deleted.
NOTE: The data set WIP.CC_ID1 has 26283 observations and 12 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.06 seconds
      cpu time            0.07 seconds
      

259  
260  data wip.cc_id2;
261  set wip.cc_id1;
262  a='U';
263  local_crsp_cusip = cats(a, crsp_cusip);
264  local_crsp_ncusip = cats(a, crsp_ncusip);
265  local_comp_cusip = cats(a, comp_cusip);
266  drop a;
267  run;

NOTE: There were 26283 observations read from the data set WIP.CC_ID1.
NOTE: The data set WIP.CC_ID2 has 26283 observations and 15 variables.
NOTE: DATA statement used (Total process time):
      real time           0.04 seconds
      cpu time            0.01 seconds
      

268  
269  proc sort data=wip.cc_id2 nodupkey; by local_comp_cusip; run;

NOTE: There were 26283 observations read from the data set WIP.CC_ID2.
NOTE: 0 observations with duplicate key values were deleted.
NOTE: The data set WIP.CC_ID2 has 26283 observations and 15 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.05 seconds
      cpu time            0.01 seconds
      

270  
271  proc sql;
271            create table wip.Refinitiv_TR_ESG2 as
272   select a.*,b.*
273   from wip.cc_id2 as a, wip.Refinitiv_TR_ESG1 as b
274   where a.local_comp_cusip=b.LOC_OFF__CODE
275   order by a.gvkey, a.crsp_year;
NOTE: Table WIP.REFINITIV_TR_ESG2 created, with 14235 rows and 21 columns.

276  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.05 seconds
      cpu time            0.04 seconds
      

277  
278  *Keep relevant variables;
279  proc sql;
280   create table wip.Refinitiv_ESG_FINAL as
281   select
282  Refinitiv_year,GVKEY, comp_conm, Refinitiv_ESGC
283  from   wip.Refinitiv_TR_ESG2;
NOTE: Table WIP.REFINITIV_ESG_FINAL created, with 14235 rows and 4 columns.

284  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.03 seconds
      cpu time            0.03 seconds
      

285  
286  proc sort data=wip.Refinitiv_ESG_FINAL nodupkey out=output.Refinitiv_ESG_FINAL; by gvkey
286  Refinitiv_year; run;

NOTE: There were 14235 observations read from the data set WIP.REFINITIV_ESG_FINAL.
NOTE: 0 observations with duplicate key values were deleted.
NOTE: The data set OUTPUT.REFINITIV_ESG_FINAL has 14235 observations and 4 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.01 seconds
      cpu time            0.00 seconds
      

287  
288  /* Step3. Process Sustainalytics ESG data */
289  *Import raw Excel files containing Sustainalytics ESG data;
290  proc import datafile="D:\Sustainalytics historical weighted
290  scores\sustainalytics_historical_weighted" out=wip.sustainalytics
291        dbms=XLSX replace;
292        sheet="sustainalytics_historical_weigh";
293  run;

NOTE: The import data set has 500445 observations and 182 variables.
NOTE: WIP.SUSTAINALYTICS data set was successfully created.
NOTE: PROCEDURE IMPORT used (Total process time):
      real time           4:12.88
      cpu time            4:08.92
      

294  
295  *Keep relevant variables;
296  data wip.sustainalytics_;
297  set wip.sustainalytics;
298  keep UNIQUE_ID UNIQUE_ID_new Date Company Type_Template Type_Listing CapitalIQ_ID
298  CompanyId Region Country ISIN total_esg_score governance_score social_score
298  environment_score;
299  run;

NOTE: There were 500445 observations read from the data set WIP.SUSTAINALYTICS.
NOTE: The data set WIP.SUSTAINALYTICS_ has 500445 observations and 15 variables.
NOTE: DATA statement used (Total process time):
      real time           0.43 seconds
      cpu time            0.26 seconds
      

300  
301  *Keep observations with non-missing CapitalIQ_ID for later merge with GVKEY;
302  data wip.sustainalytics_;
303  set wip.sustainalytics_;
304  if not missing (CapitalIQ_ID);
305  run;

NOTE: There were 500445 observations read from the data set WIP.SUSTAINALYTICS_.
NOTE: The data set WIP.SUSTAINALYTICS_ has 493365 observations and 15 variables.
NOTE: DATA statement used (Total process time):
      real time           0.27 seconds
      cpu time            0.07 seconds
      

306  
307  data wip.sustainalytics_;
308  set wip.sustainalytics_;
309      capital_iq_id = substr(CapitalIQ_ID, 3, lengthn(capital_iq_id));
310  run;

NOTE: There were 493365 observations read from the data set WIP.SUSTAINALYTICS_.
NOTE: The data set WIP.SUSTAINALYTICS_ has 493365 observations and 16 variables.
NOTE: DATA statement used (Total process time):
      real time           0.30 seconds
      cpu time            0.12 seconds
      

311  
312  proc sort data=wip.sustainalytics_ nodupkey ; by country capital_iq_id Date; run;

NOTE: There were 493365 observations read from the data set WIP.SUSTAINALYTICS_.
NOTE: 247 observations with duplicate key values were deleted.
NOTE: The data set WIP.SUSTAINALYTICS_ has 493118 observations and 16 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.36 seconds
      cpu time            0.36 seconds
      

313  
314  *Generate year variables;
315  data wip.sustainalytics_;
316  set wip.sustainalytics_;
317  year=substr(Date, 5, 4);
318  run;

NOTE: There were 493118 observations read from the data set WIP.SUSTAINALYTICS_.
NOTE: The data set WIP.SUSTAINALYTICS_ has 493118 observations and 17 variables.
NOTE: DATA statement used (Total process time):
      real time           0.35 seconds
      cpu time            0.10 seconds
      

319  
320  *Convert character variables to numeric format;
321  data wip.sustainalytics_;
322  set wip.sustainalytics_;
323  total_esg_score_ = total_esg_score*1;
324  run;
  
NOTE: There were 493118 observations read from the data set WIP.SUSTAINALYTICS_.
NOTE: The data set WIP.SUSTAINALYTICS_ has 493118 observations and 18 variables.
NOTE: DATA statement used (Total process time):
      real time           0.34 seconds
      cpu time            0.17 seconds
      

325  
326  *Convert quarterly data to annual frequency;
327  proc means data=wip.sustainalytics_ noprint;
328  by    country capital_iq_id year   ;
329  var total_esg_score_ ;
330  output out=wip.sustainalytics_annual (drop= _type_ _freq_ ) mean=total_esg_score_mean  ;
331  run;

NOTE: There were 493118 observations read from the data set WIP.SUSTAINALYTICS_.
NOTE: The data set WIP.SUSTAINALYTICS_ANNUAL has 50326 observations and 4 variables.
NOTE: PROCEDURE MEANS used (Total process time):
      real time           0.17 seconds
      cpu time            0.15 seconds
      

332  
333  proc means data=wip.sustainalytics_ noprint;
334  by    country capital_iq_id year   ;
335  var environment_score ;
336  output out=wip.sustainalytics_e (drop= _type_ _freq_ ) mean=environment_score_mean  ;
337  run;

NOTE: There were 493118 observations read from the data set WIP.SUSTAINALYTICS_.
NOTE: The data set WIP.SUSTAINALYTICS_E has 50326 observations and 4 variables.
NOTE: PROCEDURE MEANS used (Total process time):
      real time           0.15 seconds
      cpu time            0.15 seconds
      

338  
339  proc means data=wip.sustainalytics_ noprint;
340  by    country capital_iq_id year   ;
341  var social_score ;
342  output out=wip.sustainalytics_s (drop= _type_ _freq_ ) mean=social_score_mean  ;
343  run;

NOTE: There were 493118 observations read from the data set WIP.SUSTAINALYTICS_.
NOTE: The data set WIP.SUSTAINALYTICS_S has 50326 observations and 4 variables.
NOTE: PROCEDURE MEANS used (Total process time):
      real time           0.16 seconds
      cpu time            0.15 seconds
      

344  
345  proc means data=wip.sustainalytics_ noprint;
346  by    country capital_iq_id year   ;
347  var governance_score ;
348  output out=wip.sustainalytics_g (drop= _type_ _freq_ ) mean=governance_score_mean  ;
349  run;

NOTE: There were 493118 observations read from the data set WIP.SUSTAINALYTICS_.
NOTE: The data set WIP.SUSTAINALYTICS_G has 50326 observations and 4 variables.
NOTE: PROCEDURE MEANS used (Total process time):
      real time           0.25 seconds
      cpu time            0.18 seconds
      

350  
351  proc sql;
351            create table wip.sustainalytics_annual_esg as
352   select a.*,b.*,c.*,d.*
353   from wip.sustainalytics_annual as a, wip.sustainalytics_e as b,wip.sustainalytics_s as c,
353   wip.sustainalytics_g as d
354   where a.capital_iq_id=b.capital_iq_id=c.capital_iq_id=d.capital_iq_id and
354  a.year=b.year=c.year=d.year and a.country=b.country=c.country=d.country
355   order by a.capital_iq_id, a.year;
NOTE: Table WIP.SUSTAINALYTICS_ANNUAL_ESG created, with 50326 rows and 7 columns.

356  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.15 seconds
      cpu time            0.09 seconds
      

357  
358  *Merge the Sustainalytics ESG data with gvkey using capital_iq_id as the matching key;
359  *Capital_iq_linktable is the Capital IQ linking table that connects capital_iq_id
359  (COMPANYID) to gvkey;
360  data wip.Capital_iq_linktable;
361  set input.Capital_iq_gvkey_ciqid_linktable;
362  run;

NOTE: There were 125458 observations read from the data set 
      INPUT.CAPITAL_IQ_GVKEY_CIQID_LINKTABLE.
NOTE: The data set WIP.CAPITAL_IQ_LINKTABLE has 125458 observations and 5 variables.
NOTE: DATA statement used (Total process time):
      real time           0.16 seconds
      cpu time            0.06 seconds
      

363  
364  proc sql;
364            create table wip.sustainalytics_annual_ID as
365    select a.*, b.*
366    from wip.sustainalytics_annual_esg a, wip.Capital_iq_linktable b
367    where a.capital_iq_id=b.COMPANYID
368    order by a.capital_iq_id, a.year;
NOTE: Table WIP.SUSTAINALYTICS_ANNUAL_ID created, with 47529 rows and 12 columns.

369  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.16 seconds
      cpu time            0.09 seconds
      

370  
371  *Keep relevant variables;
372  proc sql;
373   create table wip.sustainalytics_esg_final as
374   select
375  year as sustainalytics_year, GVKEY,country, total_esg_score_mean as sustainalytics_ESG,
375  environment_score_mean as sustainalytics_E, social_score_mean as sustainalytics_S,
375  governance_score_mean as  sustainalytics_G
376  from   wip.sustainalytics_annual_ID;
NOTE: Table WIP.SUSTAINALYTICS_ESG_FINAL created, with 47529 rows and 7 columns.

377  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

378  
379  proc sort data=wip.sustainalytics_esg_final nodupkey out=output.sustainalytics_esg_final;
379  by gvkey sustainalytics_year; run;

NOTE: There were 47529 observations read from the data set WIP.SUSTAINALYTICS_ESG_FINAL.
NOTE: 343 observations with duplicate key values were deleted.
NOTE: The data set OUTPUT.SUSTAINALYTICS_ESG_FINAL has 47186 observations and 7 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.07 seconds
      cpu time            0.03 seconds
      

380  
381  /* Step4. Process Truvalue Labs ESG data */
382  proc sort data=input.Tv_esg_ranks_quarterly nodupkey out=wip.Tv_esg_ranks_quarterly; by
382  PERMNO tv_date; run;

NOTE: There were 329628 observations read from the data set INPUT.TV_ESG_RANKS_QUARTERLY.
NOTE: 128405 observations with duplicate key values were deleted.
NOTE: The data set WIP.TV_ESG_RANKS_QUARTERLY has 201223 observations and 10 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.20 seconds
      cpu time            0.14 seconds
      

383  
384  *Generate year variables;
385  data wip.Tv_esg_ranks_quarterly;
386  set wip.Tv_esg_ranks_quarterly;
387  year=year(tv_date);
388  run;

NOTE: There were 201223 observations read from the data set WIP.TV_ESG_RANKS_QUARTERLY.
NOTE: The data set WIP.TV_ESG_RANKS_QUARTERLY has 201223 observations and 11 variables.
NOTE: DATA statement used (Total process time):
      real time           0.10 seconds
      cpu time            0.06 seconds
      

389  
390  *Convert quarterly data to annual frequency;
391  proc means data=wip.Tv_esg_ranks_quarterly noprint;
392  by    PERMNO year   ;
393  var MATERIALITY_ADJ_INSIGHT;
394  output out=wip.Tv_esg_ranks_annual1 (drop= _type_ _freq_ )
394  mean=MATERIALITY_ADJ_INSIGHT_MEAN ;
395  run;

NOTE: There were 201223 observations read from the data set WIP.TV_ESG_RANKS_QUARTERLY.
NOTE: The data set WIP.TV_ESG_RANKS_ANNUAL1 has 52694 observations and 3 variables.
NOTE: PROCEDURE MEANS used (Total process time):
      real time           0.11 seconds
      cpu time            0.09 seconds
      

396  
397  proc means data=wip.Tv_esg_ranks_quarterly noprint;
398  by    PERMNO year   ;
399  var ALL_CATEGORIES_ADJ_INSIGHT;
400  output out=wip.Tv_esg_ranks_annual2 (drop= _type_ _freq_ )
400  mean=ALLCATEGORIES_ADJ_INSIGHT_MEAN ;
401  run;

NOTE: There were 201223 observations read from the data set WIP.TV_ESG_RANKS_QUARTERLY.
NOTE: The data set WIP.TV_ESG_RANKS_ANNUAL2 has 52694 observations and 3 variables.
NOTE: PROCEDURE MEANS used (Total process time):
      real time           0.10 seconds
      cpu time            0.11 seconds
      

402  
403  proc sql;
403            create table wip.Tv_esg_ranks_annual as
404   select a.*,b.*
405   from wip.Tv_esg_ranks_annual1 as a, wip.Tv_esg_ranks_annual2 as b
406   where a.PERMNO=b.PERMNO and a.year=b.year
407   order by a.PERMNO, a.year;
NOTE: Table WIP.TV_ESG_RANKS_ANNUAL created, with 52694 rows and 4 columns.

408  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

409  
410  *Incorporate firm identifiers (gvkey) into the TraValue Labs ESG dataset;
411  *cc_id contains the linking table between permno and gvkey obtained from the
411  CRSP/Compustat Merged database;
412  data wip.cc_id3;
413  set input.cc_id;
414  run;

NOTE: There were 326938 observations read from the data set INPUT.CC_ID.
NOTE: The data set WIP.CC_ID3 has 326938 observations and 12 variables.
NOTE: DATA statement used (Total process time):
      real time           0.11 seconds
      cpu time            0.04 seconds
      

415  
416  proc sort data=wip.cc_id3 nodupkey; by permno; run;

NOTE: There were 326938 observations read from the data set WIP.CC_ID3.
NOTE: 300130 observations with duplicate key values were deleted.
NOTE: The data set WIP.CC_ID3 has 26808 observations and 12 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.06 seconds
      cpu time            0.09 seconds
      

417  
418  proc sql;
418           create table wip.TV_ESG_ID as
419    select a.*, b.*
420    from wip.Tv_esg_ranks_annual a, wip.cc_id3 b
421    where a.permno=b.permno
422    order by b.permno, b.crsp_year;
NOTE: Table WIP.TV_ESG_ID created, with 50339 rows and 15 columns.

423  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

424  
425  *Keep relevant variables;
426  proc sql;
427   create table wip.TV_ESG_Final as
428   select
429  year as tv_year, comp_conm, GVKEY, ALLCATEGORIES_ADJ_INSIGHT_MEAN as TV_ESG,
429  MATERIALITY_ADJ_INSIGHT_MEAN as TV_ESG_material
430  from   wip.TV_ESG_ID;
NOTE: Table WIP.TV_ESG_FINAL created, with 50339 rows and 5 columns.

431  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

432  
433  proc sort data=wip.TV_ESG_Final nodupkey; by gvkey tv_year ; run;

NOTE: There were 50339 observations read from the data set WIP.TV_ESG_FINAL.
NOTE: 488 observations with duplicate key values were deleted.
NOTE: The data set WIP.TV_ESG_FINAL has 49851 observations and 5 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.03 seconds
      cpu time            0.04 seconds
      

434  
435  data output.TV_ESG_Final;
436  set wip.TV_ESG_Final;
437  if not missing (TV_ESG);
438  run;

NOTE: There were 49851 observations read from the data set WIP.TV_ESG_FINAL.
NOTE: The data set OUTPUT.TV_ESG_FINAL has 49087 observations and 5 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.00 seconds
      

439  
440  *sas to stata data conversion;
441  proc export data=output.Bloomberg_ESG_Final outfile= "D:\stata
441  data\input\Bloomberg_ESG_Final.dta" replace;
442  run;

NOTE: The export data set has 21901 observations and 7 variables.
NOTE: "D:\stata data\input\Bloomberg_ESG_Final.dta" file was successfully created.
NOTE: PROCEDURE EXPORT used (Total process time):
      real time           0.05 seconds
      cpu time            0.01 seconds
      

443  
444  proc export data=output.Refinitiv_ESG_FINAL outfile= "D:\stata
444  data\input\Refinitiv_ESG_FINAL.dta" replace;
445  run;

NOTE: The export data set has 14235 observations and 4 variables.
NOTE: "D:\stata data\input\Refinitiv_ESG_FINAL.dta" file was successfully created.
NOTE: PROCEDURE EXPORT used (Total process time):
      real time           0.01 seconds
      cpu time            0.00 seconds
      

446  
447  proc export data=output.sustainalytics_esg_final outfile= "D:\stata
447  data\input\sustainalytics_esg_final.dta" replace;
448  run;

NOTE: The export data set has 47186 observations and 7 variables.
NOTE: "D:\stata data\input\sustainalytics_esg_final.dta" file was successfully created.
NOTE: PROCEDURE EXPORT used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

449  
450  proc export data=output.TV_ESG_FINAL outfile= "D:\stata data\input\TV_ESG_FINAL.dta"
450  replace;
451  run;

NOTE: The export data set has 49087 observations and 5 variables.
NOTE: "D:\stata data\input\TV_ESG_FINAL.dta" file was successfully created.
NOTE: PROCEDURE EXPORT used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

-------------------------------------------------------------------------------

. /*=========================================================================
>  Perform additional processing on ESG ratings from other ratings providers
>  =========================================================================*/
. 
. 
. global input "/Volumes/T9/stata data/input"

. global wip "/Volumes/T9/stata data/wip"

. global output "/Volumes/T9/stata data/output"

. 
. set more off

. set type double

. 
. *****************************************************************
. /*                    1 Bloomberg ESG                          */
. *****************************************************************
. 
. use "$input/bloomberg_esg_final.dta", clear

. rename bloomberg_year esg_year

. rename bloomberg_esg bloomberg_esg_pct_

. save "$output/bloomberg_esg.dta", replace
(file /Volumes/T9/stata data/output/bloomberg_esg.dta not found)
file /Volumes/T9/stata data/output/bloomberg_esg.dta saved

. 
. 
. *****************************************************************
. /*                    2 Refinitiv ESG                          */
. *****************************************************************
. 
. use "$input/refinitiv_esg_final.dta", clear

. rename refinitiv_year esg_year

. rename refinitiv_esgc refinitiv_esgc_pct_

. save "$output/refinitiv_esg.dta", replace
(file /Volumes/T9/stata data/output/refinitiv_esg.dta not found)
file /Volumes/T9/stata data/output/refinitiv_esg.dta saved

. 
. *****************************************************************
. /*                    3 Sustainalytics ESG                     */
. *****************************************************************
. 
. use "$input/sustainalytics_esg_final.dta", clear

. rename sustainalytics_year esg_year

. destring esg_year, replace
esg_year: all characters numeric; replaced as int

. rename sustainalytics_esg sustainalytics_esg_pct_

. save "$output/sustainalytics_esg.dta", replace
(file /Volumes/T9/stata data/output/sustainalytics_esg.dta not found)
file /Volumes/T9/stata data/output/sustainalytics_esg.dta saved

. 
. *****************************************************************
. /*                    4 Truvalue Labs ESG                      */
. *****************************************************************
. 
. use "$input/tv_esg_final.dta", clear

. rename tv_year esg_year

. rename tv_esg tv_esg_pct_

. save "$output/tv_esg.dta", replace
(file /Volumes/T9/stata data/output/tv_esg.dta not found)
file /Volumes/T9/stata data/output/tv_esg.dta saved

-------------------------------------------------------------------------------

. /*===========================================================================
> ===============
>  This code is to generate firm-level characteristics to be used in the empiri
> cal analysis.
>  ============================================================================
> =============*/
. 
. global input "/Volumes/T9/stata data/input"

. global wip "/Volumes/T9/stata data/wip"

. global output "/Volumes/T9/stata data/output"

. 
. set more off

. set type double

. 
. ************************************************
. /* 1   Construct firm-level control variables */
. ************************************************
. 
. *comp_all is the compustat annual file 
. use "$input/comp_all.dta", clear

. * Exclude financial services firms (SIC codes 60006999)
. drop if substr(sic,1,1)=="6"
(139,102 observations deleted)

. gen comp_year=year(datadate)

. sort gvkey comp_year datadate

. by gvkey comp_year: keep if _N==_n 
(1,277 observations deleted)

. drop if at<=1
(13,954 observations deleted)

. replace xad=0 if xad==.
(234,003 real changes made)

. 
. * Calculate relevant variables for the analysis
. gen log_assets=ln(at)
(20,948 missing values generated)

. gen roa=ib/at
(21,890 missing values generated)

. gen cash_holding=che/at
(25,624 missing values generated)

. gen dividends=(dvc+dvp)/at
(24,615 missing values generated)

. gen debt=(dltt+dlc)/at
(26,386 missing values generated)

. gen sales=sale/(emp*1000)
(54,839 missing values generated)

. gen capex=capx/sale
(41,849 missing values generated)

. gen ad=xad/sale
(26,176 missing values generated)

. gen rnd=xrd/sale
(184,880 missing values generated)

. gen rnd_dummy=1 if xrd==.
(150,226 missing values generated)

. replace rnd_dummy=0 if rnd_dummy==.
(150,226 real changes made)

. replace rnd=0 if xrd==.
(181,055 real changes made)

. sort gvkey comp_year

. by gvkey: generate age1=_n

. generate comp_age=ln(age1)

. 
. * Generate lagged values of relevant variables for analysis
. egen id=group(gvkey)

. gen t=comp_year

. xtset id t

Panel variable: id (unbalanced)
 Time variable: t, 1950 to 2018, but with gaps
         Delta: 1 unit

. gen log_assets1=ln(l.at)
(45,968 missing values generated)

. gen roa1=l.ib/l.at
(46,787 missing values generated)

. gen cash_holding1=l.che/l.at
(50,598 missing values generated)

. gen dividends1=(l.dvc+l.dvp)/l.at
(48,508 missing values generated)

. gen debt1=(l.dltt+l.dlc)/l.at
(51,191 missing values generated)

. gen sales1=l.sale/(l.emp*1000)
(76,759 missing values generated)

. gen capex1=l.capx/l.sale
(64,831 missing values generated)

. gen ad1=l.xad/l.sale
(49,745 missing values generated)

. gen rnd1=l.xrd/l.sale
(197,193 missing values generated)

. gen rnd_dummy1=1 if l.xrd==.
(137,185 missing values generated)

. replace rnd_dummy1=0 if rnd_dummy1==.
(137,185 real changes made)

. 
. * Keep relevant variables for analysis
. keep gvkey state comp_year  fyear datadate id t sic* comp_age ceq log_assets 
> roa cash_holding dividends debt  sales  capex ad rnd rnd_dummy log_assets1 ro
> a1 cash_holding1 dividends1 debt1  sales1  capex1 ad1   rnd1 rnd_dummy1    

. 
. *comp_crsp_annual contains firm-level accounting and market information based
>  on the CRSP/Compustat Merged data from WRDS.
. merge 1:1 gvkey comp_year using "$input/comp_crsp_annual.dta", keepusing(prc 
> shrout ret)

    Result                      Number of obs
    -----------------------------------------
    Not matched                       188,186
        from master                   107,844  (_merge==1)
        from using                     80,342  (_merge==2)

    Matched                           223,437  (_merge==3)
    -----------------------------------------

. keep if _merge==3
(188,186 observations deleted)

. drop _merge

. drop if ceq<0
(7,881 observations deleted)

. gen btm=1000*ceq/(abs(prc)*shrout)
(16,335 missing values generated)

. xtset id t

Panel variable: id (unbalanced)
 Time variable: t, 1950 to 2018, but with gaps
         Delta: 1 unit

. gen btm1=1000*l.ceq/(abs(l.prc)*l.shrout)
(31,120 missing values generated)

. gen ret1=l.ret
(56,840 missing values generated)

. drop  prc shrout ceq

. rename comp_year year

. drop  id t   

. 
. *Winsorize variables
. local varlist "log_assets roa cash_holding dividends debt sales  capex ad rnd
>  btm ret log_assets1 roa1 cash_holding1 dividends1 debt1 sales1  capex1 ad1 r
> nd1 btm1 ret1"

. foreach g of local varlist{
  2. winsor2 `g', c(1 99)  replace
  3. }

. save "$output/firm_var.dta", replace
(file /Volumes/T9/stata data/output/firm_var.dta not found)
file /Volumes/T9/stata data/output/firm_var.dta saved

. 
. ******************************************************
. /* 2    Construct analyst coverage variable         */
. ******************************************************
. 
. *IBES Detail file obtained from WRDS
. use "$input/IBES_detail", clear

. gen year=year(anndats)

. gen month=month(anndats)

. gen day=day(anndats)

. format analy %12.0f

. 
. *Drop observations where analyst code equals zero 
. drop if analy==0
(11,567 observations deleted)

. *Drop observations where the analyst forecast value is missing;
. drop if value==.
(778 observations deleted)

. *Keep forecast estimates denominated in U.S. dollars;
. drop if curr_act!="USD" | report_curr!="USD" | currfl=="C"
(1,040,620 observations deleted)

. order ticker analy anndats anntims 

. sort ticker analy anndats anntims 

. duplicates drop ticker analy anndats anntims,force

Duplicates in terms of ticker analys anndats anntims

(2,830,661 observations deleted)

. by ticker analy anndats : keep if _N==_n
(139,933 observations deleted)

. sort ticker year month day analy

. 
. *The total number of analysts issuing forecasts for the firm per month
. by ticker year month: gen count=_N

. order ticker year month day analy count

. duplicates drop ticker year month, force

Duplicates in terms of ticker year month

(4,163,486 observations deleted)

. 
. *Use the maximum number of analysts issuing annual forecasts in any month wit
> hin a year. 
. *Results are robust to using the annual average instead.
. by ticker year: egen analyst_coverage=max(count)

. order analyst_coverage

. duplicates drop ticker year , force

Duplicates in terms of ticker year

(1,030,073 observations deleted)

. keep ticker year analyst_coverage

. save "$wip/ibes_analyst.dta", replace
(file /Volumes/T9/stata data/wip/ibes_analyst.dta not found)
file /Volumes/T9/stata data/wip/ibes_analyst.dta saved

. 
. * Incorporate gvkey into the IBES data
. * comp_ibes_ticker is the gvkeyibes ticker linking table obtained from WRDS
. use "$input/comp_ibes_ticker.dta", clear

. drop if missing(ibtic)
(31,335 observations deleted)

. gen active=0 if secstat=="I"
(10,364 missing values generated)

. replace active=1 if secstat=="A"
(10,364 real changes made)

. sort gvkey active dldtei

. by gvkey: keep if _n==_N
(1,922 observations deleted)

. keep gvkey ibtic

. rename ibtic ticker

. merge m:m  ticker using  "$wip/ibes_analyst.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                        19,458
        from master                     8,594  (_merge==1)
        from using                     10,864  (_merge==2)

    Matched                           157,411  (_merge==3)
    -----------------------------------------

. keep if _merge==3
(19,458 observations deleted)

. drop _merge

. gen log_analyst_coverage=ln(1+analyst_coverage)

. gen t=year

. gen id=gvkey

. destring id, replace
id: all characters numeric; replaced as long

. xtset id t

Panel variable: id (unbalanced)
 Time variable: t, 1981 to 2020, but with gaps
         Delta: 1 unit

. gen log_analyst_coverage1=l.log_analyst_coverage
(20,637 missing values generated)

. replace log_analyst_coverage1=0 if log_analyst_coverage1==.
(20,637 real changes made)

. drop id t ticker

. save  "$output/ibes_analyst_coverage.dta", replace
(file /Volumes/T9/stata data/output/ibes_analyst_coverage.dta not found)
file /Volumes/T9/stata data/output/ibes_analyst_coverage.dta saved

. 
. ****************************************************
. * 3    Calculate  total institutional ownership    *
. ****************************************************
. 
. *io_timeseries is quarterly firm-level total institutional ownership, see WRD
> S for reference:
. *https://wrds-www.wharton.upenn.edu/pages/wrds-research/applications/institut
> ional-ownership-research/institutional-ownership-concentration-and-breadth-ra
> tios/
. use "$input/io_timeseries.dta", clear

. keep gvkey rdate ior

. generate year=year(rdate)

. sort gvkey year rdate

. 
. *Compute the annual average of total institutional ownership across quarters 
> for each firm;
. by gvkey year: egen ior_mean=mean(ior)

. duplicates drop gvkey year, force

Duplicates in terms of gvkey year

(779,371 observations deleted)

. drop ior

. rename ior_mean ior

. egen id=group(gvkey)

. gen t=year

. xtset id t

Panel variable: id (unbalanced)
 Time variable: t, 1925 to 2016, but with gaps
         Delta: 1 unit

. gen ior1=l.ior
(20,314 missing values generated)

. replace ior1=0 if ior1==.
(20,314 real changes made)

. keep gvkey year ior ior1

. local varlist " ior ior1"

. foreach g of local varlist{
  2. winsor2 `g', c(1 99)  replace
  3. }

. save "$output/ior_total.dta", replace
(file /Volumes/T9/stata data/output/ior_total.dta not found)
file /Volumes/T9/stata data/output/ior_total.dta saved

. 
. 
. ****************************************************
. *           4  Identify  Blue State                *
. ****************************************************
. 
. *U.S. states are classified as politically "red" or "blue" based on widely re
> cognized patterns in presidential voting outcomes. The classifications are ma
> nually compiled based on public election information.
. import excel "/Volumes/T9/Blue State/red_blue.xlsx", sheet("Sheet2") firstrow
>  clear
(3 vars, 714 obs)

. egen state_=group(state)

. xtset state_ year 

Panel variable: state_ (strongly balanced)
 Time variable: year, 2002 to 2015
         Delta: 1 unit

. gen Blue1=l.Blue
(51 missing values generated)

. drop state_

. save "$output/blue.dta", replace 
(file /Volumes/T9/stata data/output/blue.dta not found)
file /Volumes/T9/stata data/output/blue.dta saved

-------------------------------------------------------------------------------

. /*===========================================================================
> =======
>  This code is to merge all variables required for the baseline regression ana
> lysis.
>  ============================================================================
> ======*/
. 
. global input "/Volumes/T9/stata data/input"

. global wip "/Volumes/T9/stata data/wip"

. global output "/Volumes/T9/stata data/output"

. 
. set more off

. set type double

. 
. 
. 
. *********************************************************
. /*         1 Merge all the control variables           */
. *********************************************************
. 
. use "$output/firm_var.dta", clear

. merge 1:1 gvkey year using "$output/ibes_analyst_coverage.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                       160,765
        from master                   109,455  (_merge==1)
        from using                     51,310  (_merge==2)

    Matched                           106,101  (_merge==3)
    -----------------------------------------

. drop if _merge==2
(51,310 observations deleted)

. drop _merge

. merge 1:1 gvkey year using "$output/ior_total.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                       109,721
        from master                    23,846  (_merge==1)
        from using                     85,875  (_merge==2)

    Matched                           191,710  (_merge==3)
    -----------------------------------------

. drop if _merge==2
(85,875 observations deleted)

. drop _merge

. merge m:1 state year using  "$output/blue.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                       170,381
        from master                   170,375  (_merge==1)
        from using                          6  (_merge==2)

    Matched                            45,181  (_merge==3)
    -----------------------------------------

. drop if _merge==2
(6 observations deleted)

. drop _merge

. save "$output/all_control.dta", replace
file /Volumes/T9/stata data/output/all_control.dta saved

. 
. 
. ***************************************************************************
. /*    2 Merge the dependent variable with the independent variables      */
. ***************************************************************************
. 
. 
. use "$output/kld_esg.dta", clear

. rename esg_year year

. merge 1:1 year gvkey using "$output/all_large_investee.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                        41,138
        from master                    40,535  (_merge==1)
        from using                        603  (_merge==2)

    Matched                             3,408  (_merge==3)
    -----------------------------------------

. replace largest_blockholders=0 if _merge==1
(40,535 real changes made)

. replace smallest_blockholders=0 if _merge==1
(40,535 real changes made)

. replace large_investee=0 if _merge==1
(40,535 real changes made)

. replace large_investee_prior=0 if _merge==1
(40,535 real changes made)

. replace dedicated=0 if _merge==1
(40,535 real changes made)

. replace transient=0 if _merge==1
(40,535 real changes made)

. replace large_quarter=0 if _merge==1
(40,535 real changes made)

. generate multiple_hold=1 if num_hold>1
(875 missing values generated)

. replace multiple_hold=0 if multiple_hold==.
(875 real changes made)

. generate single_hold=1 if num_hold==1
(43,671 missing values generated)

. replace single_hold=0 if single_hold==.
(43,671 real changes made)

. drop if _merge==2
(603 observations deleted)

. drop rdate _merge

. merge m:m year gvkey using "$output/all_control.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                       193,641
        from master                    11,014  (_merge==1)
        from using                    182,627  (_merge==2)

    Matched                            32,929  (_merge==3)
    -----------------------------------------

. keep if _merge==3
(193,641 observations deleted)

. drop _merge

. rename debt leverage

. rename debt1 leverage1

. rename comp_age age

. rename large_investee sister

. 
. *Label variables
. label variable kld_esg "ESG ratings"

. label variable sister "Sister"

. label variable log_assets "Total Assets"

. label variable btm "Book-to-Market"

. label variable age "Firm Age"  

. label variable leverage "Leverage"

. label variable sales "Sales Per Employee"

. label variable roa "ROA"

. label variable ret  "Prior-Year Return" 

. label variable capex "Capital Expenditure"

. label variable rnd "R&D"

. label variable rnd_dummy "R&D Missing Dummy"

. label variable dividends "Dividends"

. label variable cash_holding "Cash Holdings"

. label variable ad "Advertising Intensity"

. label variable ior "Institutional Ownership"

. label variable log_analyst_coverage "Analyst Coverage"

. label variable Blue "Blue-State Dummy"

. save "$output/regression_sample.dta", replace
file /Volumes/T9/stata data/output/regression_sample.dta saved

-------------------------------------------------------------------------------

. /*===========================================================================
> ======
>  *            Difference-in-Differences (DiD) Sample Construction            
>      *
>  *     An  event window of one year before and after the rater-ownership chan
> ge   *
>  ============================================================================
> =====*/
. 
. global input "/Volumes/T9/stata data/input"

. global wip "/Volumes/T9/stata data/wip"

. global output "/Volumes/T9/stata data/output"

. 
. set more off

. set type double

. 
. ***********************************************
. /*    Step1: Identify large shareholders     */
. ***********************************************
. 
. *To align the timing of ownership and ESG information, we lag the ownership d
> ata by six months
. *Results are robust without the lag
. use "$input/ownership_final.dta", clear

. gen effective_date= dofm(mofd(rdate) - 6) + day(rdate) - 2

. format effective_date %td

. drop rdate

. rename effective_date rdate

. save "$input/ownership_final1.dta", replace
file /Volumes/T9/stata data/input/ownership_final1.dta saved

. 
. *Step1.1 Identify quarterly ownership stakes in the rater by institutional in
> vestors
. use "$input/ownership_final1.dta", clear

. keep if gvkey=="178507"
(61,047,168 observations deleted)

. 
. *Keep only ownership information for the years 2009-2010;
. keep if rdate<=date("29dec2010","DMY") & rdate>=date("29mar2009","DMY")
(4,815 observations deleted)

. gen large_shareholder=1 if ior_a>0.05
(1,535 missing values generated)

. replace large_shareholder=0 if large_shareholder==.
(1,535 real changes made)

. save "$wip/rater_shareholder.dta", replace
(file /Volumes/T9/stata data/wip/rater_shareholder.dta not found)
file /Volumes/T9/stata data/wip/rater_shareholder.dta saved

. 
. *Step1.2 Keep institutional investors classified as large shareholders
. use "$wip/rater_shareholder.dta", clear

. keep if large_shareholder==1
(1,535 observations deleted)

. drop ior_a ior_b

. save "$wip/rater_large_shareholder.dta", replace
(file /Volumes/T9/stata data/wip/rater_large_shareholder.dta not found)
file /Volumes/T9/stata data/wip/rater_large_shareholder.dta saved

. 
. *Step1.3 Create a list of large shareholders 
. use "$wip/rater_large_shareholder.dta", clear

. sort  mgrno rdate

. by mgrno : gen large_quarter=_N

. 
. *Keep large shareholders that appear in both the pre- and post-rater-ownershi
> p change periods
. keep if large_quarter>4
(3 observations deleted)

. duplicates drop mgrno, force

Duplicates in terms of mgrno

(19 observations deleted)

. keep mgrno mgrname  large_quarter

. save "$wip/rater_blockholder_list.dta", replace
(file /Volumes/T9/stata data/wip/rater_blockholder_list.dta not found)
file /Volumes/T9/stata data/wip/rater_blockholder_list.dta saved

. 
. ************************************************
. /*       Step2: Identify Treatment Firms      */
. ************************************************
. 
. *Step 2.1 Identify investee firms held by these large shareholders
. use "$wip/rater_blockholder_list.dta", clear

. merge 1:m mgrno using "$input/ownership_final1.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                    60,533,527
        from master                         0  (_merge==1)
        from using                 60,533,527  (_merge==2)

    Matched                           520,016  (_merge==3)
    -----------------------------------------

. keep if  _merge==3
(60,533,527 observations deleted)

. drop _merge

. 
. *Define large investees as investees making up 0.25% of a large shareholder'
> s portfolio, following Kedia et al. (2017)
. *0.25% is the 75th percentile in 13F universe
. gen large75_=1 if ior_b>=0.0025
(483,805 missing values generated)

. replace large75_=0 if large75_==.
(483,805 real changes made)

. sort mgrno gvkey rdate

. gen year=year(rdate)

. sort mgrno gvkey  year rdate

. by mgrno gvkey year: gen each_quarter_=_N

. by mgrno gvkey year: egen min_large75_=min(large75_)

. 
. *Step 2.2 Identify firm-year observations for treated firms
. gen treated1=1 if each_quarter_==4&min_large75_==1
(497,680 missing values generated)

. replace treated1=0 if treated1==.
(497,680 real changes made)

. drop large75_  each_quarter_ min_large75_ 

. keep if treated1==1
(497,680 observations deleted)

. rename treated1 treated

. sort gvkey rdate  large_quarter

. order gvkey rdate mgrno  large_quarter  

. 
. *If multiple large shareholders hold a treated firm on the same day, we retai
> n the one 
. *who has held the rater as a large shareholder for the longest period (in qua
> rters).
. by gvkey rdate : keep if _N==_n
(2,612 observations deleted)

. *Keep the last available observation for each treated firm in a year
. sort  gvkey  year  rdate 

. by gvkey year  : keep if _n==_N
(14,793 observations deleted)

. keep gvkey year treated 

. 
. *Keep treated sample for the years 20092010
. keep if year<=2010 & year>=2009
(4,591 observations deleted)

. save "$wip/DID_sample_full.dta", replace
(file /Volumes/T9/stata data/wip/DID_sample_full.dta not found)
file /Volumes/T9/stata data/wip/DID_sample_full.dta saved

. 
. *Keep treated firms that are large investees of large shareholders in both th
> e year before and the year after the rater ownership change
. use "$wip/DID_sample_full.dta", clear

. sort gvkey year

. by gvkey: gen count=_N

. keep if count==2
(98 observations deleted)

. drop count

. save "$output/DID_sample.dta", replace
file /Volumes/T9/stata data/output/DID_sample.dta saved

. 
. *Create a list of large investee firms that were added to the rater's large s
> hareholders' portfolios after the rater ownership change
. use "$wip/DID_sample_full", clear

. sort gvkey year

. by gvkey: gen count=_N

. keep if count==1
(242 observations deleted)

. drop count

. keep if year==2010
(26 observations deleted)

. save "$output/DID_sample_fiterout.dta", replace
file /Volumes/T9/stata data/output/DID_sample_fiterout.dta saved

-------------------------------------------------------------------------------

. /*===========================================================================
> ======
>  *            Difference-in-Differences (DiD) Sample Construction            
>      *
>  *   An  event window of two years before and after the rater-ownership chang
> e    *
>  ============================================================================
> =====*/
. 
. global input "/Volumes/T9/stata data/input"

. global wip "/Volumes/T9/stata data/wip"

. global output "/Volumes/T9/stata data/output"

. 
. set more off

. set type double

. 
. ***********************************************
. /*    Step1: Identify large shareholders     */
. ***********************************************
. 
. *To align the timing of ownership and ESG information, we lag the ownership d
> ata by six months
. *Results are robust without the lag
. use "$input/ownership_final.dta", clear

. gen effective_date= dofm(mofd(rdate) - 6) + day(rdate) - 2

. format effective_date %td

. drop rdate

. rename effective_date rdate

. save "$input/ownership_final1.dta", replace
file /Volumes/T9/stata data/input/ownership_final1.dta saved

. 
. *Step1.1 Identify quarterly ownership stakes in the rater by institutional in
> vestors
. use "$input/ownership_final1.dta", clear

. keep if gvkey=="178507"
(61,047,168 observations deleted)

. 
. *Keep only ownership information for the years 2008-2011;
. keep if rdate<=date("29dec2011","DMY") & rdate>=date("29mar2008","DMY")
(3,563 observations deleted)

. gen large_shareholder=1 if ior_a>0.05
(2,754 missing values generated)

. replace large_shareholder=0 if large_shareholder==.
(2,754 real changes made)

. save "$wip/rater_shareholder.dta", replace
file /Volumes/T9/stata data/wip/rater_shareholder.dta saved

. 
. *Step1.2 Keep institutional investors classified as large shareholders
. use "$wip/rater_shareholder.dta", clear

. keep if large_shareholder==1
(2,754 observations deleted)

. drop ior_a ior_b

. save "$wip/rater_large_shareholder.dta", replace
file /Volumes/T9/stata data/wip/rater_large_shareholder.dta saved

. 
. *Step1.3 Create a list of large shareholders 
. use "$wip/rater_large_shareholder.dta", clear

. sort  mgrno rdate

. by mgrno : gen large_quarter=_N

. 
. *Keep large shareholders that appear in both the two years before and the two
>  years after the rater-ownership change
. keep if large_quarter>12
(28 observations deleted)

. duplicates drop mgrno, force

Duplicates in terms of mgrno

(28 observations deleted)

. keep mgrno mgrname  large_quarter

. save "$wip/rater_blockholder_list.dta", replace
file /Volumes/T9/stata data/wip/rater_blockholder_list.dta saved

. 
. ************************************************
. /*       Step2: Identify Treatment Firms      */
. ************************************************
. 
. *Step 2.1 Identify investee firms held by these large shareholders
. use "$wip/rater_blockholder_list.dta", clear

. merge 1:m mgrno using "$input/ownership_final1.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                    60,818,239
        from master                         0  (_merge==1)
        from using                 60,818,239  (_merge==2)

    Matched                           235,304  (_merge==3)
    -----------------------------------------

. keep if  _merge==3
(60,818,239 observations deleted)

. drop _merge

. 
. *Define large investees as investees making up 0.25% of a large shareholder'
> s portfolio, following Kedia et al. (2017)
. *0.25% is the 75th percentile in 13F universe
. gen large75_=1 if ior_b>=0.0025
(210,914 missing values generated)

. replace large75_=0 if large75_==.
(210,914 real changes made)

. sort mgrno gvkey  rdate

. gen year=year(rdate)

. sort mgrno gvkey  year rdate

. by mgrno gvkey year: gen each_quarter_=_N

. by mgrno gvkey year: egen min_large75_=min(large75_)

. 
. *Step 2.2 Identify firm-year observations for treated firms
. gen treated1=1 if each_quarter_==4&min_large75_==1
(219,008 missing values generated)

. replace treated1=0 if treated1==.
(219,008 real changes made)

. drop large75_   each_quarter_ min_large75_  

. keep if treated1==1
(219,008 observations deleted)

. rename treated1 treated

. sort gvkey rdate  large_quarter

. order gvkey rdate mgrno  large_quarter  

. 
. *If multiple large shareholders hold a treated firm on the same day, we retai
> n the one 
. *who has held the rater as a large shareholder for the longest period (in qua
> rters).
. by gvkey rdate : keep if _N==_n
(140 observations deleted)

. *Keep the last available observation for each treated firm in a year
. sort  gvkey  year  rdate 

. by gvkey year  : keep if _n==_N
(12,117 observations deleted)

. keep gvkey year treated 

. 
. *Keep treated sample for the years 20082011
. keep if year<=2011 & year>=2008
(3,431 observations deleted)

. save "$wip/DID_sample_full_2year.dta", replace
(file /Volumes/T9/stata data/wip/DID_sample_full_2year.dta not found)
file /Volumes/T9/stata data/wip/DID_sample_full_2year.dta saved

. 
. *Keep treated firms that are large investees of large shareholders in both th
> e year before and the year after the rater-ownership change
. use "$wip/DID_sample_full_2year.dta", clear

. sort gvkey year

. by gvkey: gen count=_N

. keep if count==4
(320 observations deleted)

. drop count

. save "$output/DID_sample_2year.dta", replace
file /Volumes/T9/stata data/output/DID_sample_2year.dta saved

. 
. *Create a list of large investee firms that were added to the rater's large s
> hareholders' portfolios after the rater-ownership change
. use "$wip/DID_sample_full_2year.dta", clear

. sort gvkey year

. by gvkey: gen count=_N

. keep if count==1
(525 observations deleted)

. drop count

. keep if year==2010|year==2011
(39 observations deleted)

. save "$output/DID_sample_fiterout1_2year.dta", replace
file /Volumes/T9/stata data/output/DID_sample_fiterout1_2year.dta saved

. 
. use "$wip/DID_sample_full_2year.dta", clear

. sort gvkey year

. by gvkey: gen count=_N

. keep if count==2
(506 observations deleted)

. by gvkey: egen sum=sum(year)

. drop count

. keep if sum==4021
(52 observations deleted)

. drop sum

. save "$output/DID_sample_fiterout2_2year.dta", replace
file /Volumes/T9/stata data/output/DID_sample_fiterout2_2year.dta saved

. 
. use "$output/DID_sample_fiterout1_2year.dta", clear

. append using "$output/DID_sample_fiterout2_2year.dta"

. save "$output/DID_sample_fiterout_2year.dta", replace
file /Volumes/T9/stata data/output/DID_sample_fiterout_2year.dta saved

-------------------------------------------------------------------------------

. /*=======================================================================
>  *       This code calculates measures of ESG rating disagreement       *
>  =======================================================================*/
. 
. global input "/Volumes/T9/stata data/input"

. global wip "/Volumes/T9/stata data/wip"

. global output "/Volumes/T9/stata data/output"

. 
. set more off

. set type double

. 
. use "$output/kld_esg.dta", clear

. merge 1:1 gvkey esg_year using "$output/bloomberg_esg.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                        38,526
        from master                    30,284  (_merge==1)
        from using                      8,242  (_merge==2)

    Matched                            13,659  (_merge==3)
    -----------------------------------------

. drop _merge

. merge 1:1 gvkey esg_year using "$output/refinitiv_esg.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                        39,324
        from master                    38,637  (_merge==1)
        from using                        687  (_merge==2)

    Matched                            13,548  (_merge==3)
    -----------------------------------------

. drop _merge

. merge 1:1 gvkey esg_year using "$output/sustainalytics_esg.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                        79,140
        from master                    42,413  (_merge==1)
        from using                     36,727  (_merge==2)

    Matched                            10,459  (_merge==3)
    -----------------------------------------

. drop _merge

. merge 1:1 gvkey esg_year using "$output/tv_esg.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                        85,644
        from master                    63,078  (_merge==1)
        from using                     22,566  (_merge==2)

    Matched                            26,521  (_merge==3)
    -----------------------------------------

. drop _merge

. 
. *Calculate the pairwise ESG rating disagreement
. gen pair_dis1=abs(refinitiv_esgc_pct_ -tv_esg_pct_)/(2^0.5)
(102,718 missing values generated)

. gen pair_dis2=abs(refinitiv_esgc_pct_ -bloomberg_esg_pct_)/(2^0.5)
(101,190 missing values generated)

. gen pair_dis3=abs(refinitiv_esgc_pct_ -sustainalytics_esg_pct_)/(2^0.5)
(103,866 missing values generated)

. gen pair_dis4=abs(tv_esg_pct_ -bloomberg_esg_pct_)/(2^0.5)
(94,584 missing values generated)

. gen pair_dis5=abs(tv_esg_pct_ -sustainalytics_esg_pct_)/(2^0.5)
(101,461 missing values generated)

. gen pair_dis6=abs(bloomberg_esg_pct_ -sustainalytics_esg_pct_)/(2^0.5)
(102,850 missing values generated)

. gen pair_dis7=abs(kld_esg_pct_-refinitiv_esgc_pct_)/(2^0.5)
(103,164 missing values generated)

. gen pair_dis8=abs(kld_esg_pct_ -tv_esg_pct_)/(2^0.5)
(94,300 missing values generated)

. gen pair_dis9=abs(kld_esg_pct_ -bloomberg_esg_pct_)/(2^0.5)
(98,506 missing values generated)

. gen pair_dis10=abs(kld_esg_pct_ -sustainalytics_esg_pct_)/(2^0.5)
(106,043 missing values generated)

. 
. *Take the average across all the ESG rater pairs
. local pair_dis_group1 "pair_dis1 pair_dis2 pair_dis3 pair_dis4 pair_dis5 pair
> _dis6  "

. egen disagreement_group1 = rmean(`pair_dis_group1')
(88,577 missing values generated)

. local pair_dis_group2 "pair_dis1 pair_dis2 pair_dis3 pair_dis4 pair_dis5 pair
> _dis6 pair_dis7 pair_dis8 pair_dis9 pair_dis10 "

. egen disagreement_group2 = rmean(`pair_dis_group2')
(79,718 missing values generated)

. gen id_=gvkey 

. destring id_, replace
id_: all characters numeric; replaced as long

. xtset id_ esg_year

Panel variable: id_ (unbalanced)
 Time variable: esg_year, 1991 to 2021, but with gaps
         Delta: 1 unit

. gen disagreement_group1_f1=f.disagreement_group1
(88,801 missing values generated)

. gen disagreement_group2_f1=f.disagreement_group2
(80,378 missing values generated)

. rename esg_year year

. keep gvkey year disagreement_group1 disagreement_group2 disagreement_group1_f
> 1 disagreement_group2_f1  

. save "$output/rating_disagreement.dta", replace
file /Volumes/T9/stata data/output/rating_disagreement.dta saved

-------------------------------------------------------------------------------

. /*===========================================================================
> ======
>  *               Construct three proxies for business complexity             
>      *
>  ============================================================================
> =====*/
. 
. global input "/Volumes/T9/stata data/input"

. global wip "/Volumes/T9/stata data/wip"

. global output "/Volumes/T9/stata data/output"

. 
. set more off

. set type double

. 
. ***********************************************************
. /* 1   Construct organizational complexity measures      */
. ***********************************************************
. 
. *comp_segments_hist_business_var comes from the Compustat Historical Segments
>  Data
. use "$input/comp_segments_hist_business_var.dta", clear

. *Keep only segments classified as Business Segments
. keep if stype=="BUSSEG"
(1,403,900 observations deleted)

. 
. /*Handle duplicate segment records; see WRDS Segments Support Articles below 
> for guidance:
> 
> DATADATE refers to the period being reported. In the case of seg_annfund, it 
> is the last date of the fiscal year being reported, and therefore is the vari
> able to use when linking with other company-wide fiscal year data such as FUN
> DA (but read to the end of this note to avoid double-counting or triple-count
> ing segments data in the linking process).
> 
> SRCDATE refers to a sort of "copyright date" of the data. In the case of segm
> ents data, companies typically report (e.g. in shareholders' reports) the mos
> t recent three years. This information is collected in the segments database 
> by Compustat. As a result, segments data for a given DATADATE is likely to be
>  reported 3 times, in three successive years. And of course, SRCDATE >= DATAD
> ATE always.
> 
> This is helpful because when a company re-organizes its segments, say in fisc
> al year I, it retrospectively reports years I-1 and I-2 using the new segment
> s definitions, which allows shareholders to better understand recent performa
> nce using the new scheme. Segments data would also change if a company issued
>  a restated 10-K within two years.
> 
> Presumably if a company does not change its segments classification over thre
> e years, the data in the SEG_ANNFUND should be the same for each SRCDATE (ass
> uming no company restatements occurred). WRDS has not tested this conjecture.
> 
> The upshot for the researcher is to use only one SRCDATE for each DATADATE of
>  interest, to avoid double- or triple-counting. If you want to see segments d
> ata using the most recent organization structure, always choose the highest S
> RCDATE for a given DATADATE. 
> */
. sort gvkey sid datadate srcdate

. by gvkey sid datadate: keep if _N==_n
(525,249 observations deleted)

. sort gvkey datadate srcdate sid

. gen year=year(datadate)

. sort gvkey sid year datadate

. by gvkey sid year: keep if _N==_n 
(1,175 observations deleted)

. 
. *Calculate the number of reported industry segments by each firm in a given y
> ear
. drop if sics1==""
(56,200 observations deleted)

. sort gvkey year sics1

. duplicates drop gvkey year sics1, force

Duplicates in terms of gvkey year sics1

(77,006 observations deleted)

. by gvkey year: gen num_segment_industries=_N

. duplicates drop gvkey year, force

Duplicates in terms of gvkey year

(164,871 observations deleted)

. keep gvkey year num_segment_industries

. save "$output/organization_segments.dta", replace
file /Volumes/T9/stata data/output/organization_segments.dta saved

. 
. *******************************************************************
. /* 2        Construct geographical complexity measures           */
. *******************************************************************
. 
. use "$input/comp_segments_hist_business_var.dta", clear

. *Keep only segments classified as Geographic Segment
. keep if stype=="GEOSEG"
(1,312,612 observations deleted)

. sort gvkey sid datadate srcdate

. by gvkey sid datadate: keep if _N==_n
(538,760 observations deleted)

. sort gvkey datadate srcdate sid

. gen year=year(datadate)

. sort gvkey sid year datadate

. by gvkey sid year: keep if _N==_n 
(1,632 observations deleted)

. 
. *Calculate the number of reported geographical segments by each firm in a giv
> en year
. sort gvkey year sid

. by gvkey year: gen num_segments=_N

. duplicates drop gvkey year, force

Duplicates in terms of gvkey year

(412,990 observations deleted)

. keep gvkey year num_segments 

. save "$output/geographical_segments.dta", replace
file /Volumes/T9/stata data/output/geographical_segments.dta saved

. 
. ************************************************************************
. /* 3 Construct a firm complexity measure based on  10-K filing size   */
. ************************************************************************
. 
. *Import firm complexity data
. *firm complexity data comes from the Software Repository for Accounting and F
> inance at the University of Notre Dame
. set excelxlsxlargefile on

. import excel "/Volumes/T9/Firm Complexity/Loughran_McDonald_Complexity.xlsx",
>  sheet("Loughran_McDonald_Complexity") firstrow clear
(4 vars, 120,994 obs)

. tostring filingdate, replace format(%20.0f)
filingdate was long now str8

. gen filingdate_ = date(filingdate,"YMD")

. format filingdate_ %td

. drop  filingdate

. rename filingdate_ filingdate

. gen year = year(filingdate)

. gen month= month(filingdate)

. sort cik year filingdate

. by cik year: keep if _N==_n
(336 observations deleted)

. save "$output/10k_firm_complexity.dta", replace
file /Volumes/T9/stata data/output/10k_firm_complexity.dta saved

. 
. *Incorporate gvkey into the firm complexity data 
. *comp_gvkey_cik_link is a gvkey-cik linking table from Compustat
. use "$output/10k_firm_complexity.dta", clear

. merge 1:1  cik year using "$input/comp_gvkey_cik_link.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                       340,364
        from master                     6,447  (_merge==1)
        from using                    333,917  (_merge==2)

    Matched                           114,211  (_merge==3)
    -----------------------------------------

. keep if _merge==3
(340,364 observations deleted)

. drop _merge

. * If the 10-K filing date is before July, treat it as reflecting the previous
>  fiscal year
. * If filed in July or later, treat it as reflecting the current year's financ
> ials
. gen year_=year-1 if month<=6
(20,739 missing values generated)

. replace year_=year if year_==.
(20,739 real changes made)

. drop year

. rename year_ year

. sort cik year filingdate

. by cik year: keep if _N==_n
(478 observations deleted)

. save "$output/10k_firm_complexity_.dta", replace
file /Volumes/T9/stata data/output/10k_firm_complexity_.dta saved

-------------------------------------------------------------------------------

2    ******************************************************************************************
2    ****************************
3    This code calculates the market value of each firm held in the portfolios of the ESG
3    raters large shareholders,
4    and then aggregates these values across firms to obtain the total assets under management
4    for each large shareholder.
5    *The code is adapted from:
6    *Luis Palacios, Rabih Moussawi, and Denys Glushkov (2009), available through WRDS Research
6     Applications.
7    *Modifications were made for our sample coverage and definitions
8    ******************************************************************************************
8    ****************************;
9    
10   libname input 'D:\sas data\input';
NOTE: Libref INPUT was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\input
11   libname wip 'D:\sas data\wip';
NOTE: Libref WIP was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\wip
12   libname output 'D:\sas data\output';
NOTE: Libref OUTPUT was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\output
13   
14   /* Step1. Specifying options */
15   *Adjust share and price in monthly data;
16   *and since Thomson 13F is quarterly (FDATE & RDATE);
17   *align CRSP month-end dates and keep quarter ends;
18   *crsp_m is the CRSP monthly data file;
19   data wip.crsp_m; format QDATE date9.;
20   set input.crsp_m;
21   QDATE = INTNX('QTR',date,0,'E');
22   DATE = INTNX("MONTH",date,0,"E");
23   P = abs(prc)/cfacpr;
24   TSO=shrout*cfacshr*1000;
25   if TSO<=0 then TSO=.;
26   ME = P*TSO/1000000;
27   drop ncusip prc cfacpr shrout exchcd shrcd ret;
28   format ret percentn8.4 ME P dollar12.3 TSO comma12.;
29   run;

NOTE: There were 3282906 observations read from the data set INPUT.CRSP_M.
NOTE: The data set WIP.CRSP_M has 3282906 observations and 7 variables.
NOTE: DATA statement used (Total process time):
      real time           1.72 seconds
      cpu time            1.45 seconds
      

30   
31   *Keep last monthly observation for each quarter;
32   data wip.crsp_m;
33   set wip.crsp_m;
34   by permno qdate date;
35   if last.qdate;
36   drop date;
37   run;

NOTE: There were 3282906 observations read from the data set WIP.CRSP_M.
NOTE: The data set WIP.CRSP_M has 1112338 observations and 6 variables.
NOTE: DATA statement used (Total process time):
      real time           0.37 seconds
      cpu time            0.31 seconds
      

38   
39   *Keep only institutional investors who are large shareholders of the ESG rater;
40   *s34type1 is from Thomson Reuters on WRDS;
41   data wip.s34type1_;
42   set input.s34type1;
43   where MGRNO=27800 or MGRNO=63050 or MGRNO=22620 or MGRNO=9385 or MGRNO=48170 or
43   MGRNO=91910 or MGRNO=10511 or MGRNO=8000 or MGRNO=90457 or MGRNO=12497 or MGRNO=11307 or
43   MGRNO=6125 or MGRNO=71110 or MGRNO=58950;
44   run;

NOTE: There were 1425 observations read from the data set INPUT.S34TYPE1.
      WHERE MGRNO in (6125, 8000, 9385, 10511, 11307, 12497, 22620, 27800, 48170, 58950, 
      63050, 71110, 90457, 91910);
NOTE: The data set WIP.S34TYPE1_ has 1425 observations and 8 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.03 seconds
      

45   
46   /* Step2. Merge TR-13F S34type1 and S34type3 sets */
47   *First, keep first vintage with holdings data for each RDATE-MGRNO combinations;
48   proc sql;
49   create table wip.First_Vint
50   as select distinct rdate, fdate, mgrno, mgrname,TYPECODE
51   from wip.s34type1_
52   group by mgrno, rdate
53   having fdate=min(fdate)
54   order by mgrno, rdate;
NOTE: The query requires remerging summary statistics back with the original data.
NOTE: Table WIP.FIRST_VINT created, with 1373 rows and 5 columns.

55   quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.03 seconds
      cpu time            0.03 seconds
      

56   
57   *Marker for first and last quarters of reporting & reporting gaps;
58   data wip.First_Vint;
59   set wip.First_Vint;
60   by mgrno rdate;
61   length First_Report 3;
62   First_Report = (first.mgrno or intck("QTR",lag(rdate),rdate)>1);
63   run;
  
NOTE: There were 1373 observations read from the data set WIP.FIRST_VINT.
NOTE: The data set WIP.FIRST_VINT has 1373 observations and 6 variables.
NOTE: DATA statement used (Total process time):
      real time           0.02 seconds
      cpu time            0.03 seconds
      

64   
65   *Last report by institutional manager, or missing 13F reports in the next quarter(s);
66   proc sort data=wip.First_Vint nodupkey; by mgrno descending rdate; run;

NOTE: There were 1373 observations read from the data set WIP.FIRST_VINT.
NOTE: 0 observations with duplicate key values were deleted.
NOTE: The data set WIP.FIRST_VINT has 1373 observations and 6 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.01 seconds
      cpu time            0.00 seconds
      

67   
68   data wip.First_Vint;
69   set wip.First_Vint;
70   by mgrno descending rdate;
71   length Last_Report 3;
72   Last_Report = (first.mgrno or intck("QTR",rdate,lag(rdate))>1);
73   run;

NOTE: There were 1373 observations read from the data set WIP.FIRST_VINT.
NOTE: The data set WIP.FIRST_VINT has 1373 observations and 7 variables.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      cpu time            0.00 seconds
      

74   
75   *Add total number of 13F filers during each quarter;
76   proc sql undo_policy=none;
77   create table wip.First_Vint
78   as select distinct *, count(mgrno) as NumInst
79   from wip.First_Vint
80   group by rdate
81   order by fdate, mgrno;
NOTE: The query requires remerging summary statistics back with the original data.
NOTE: Table WIP.FIRST_VINT created, with 1373 rows and 8 columns.

82   quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.03 seconds
      cpu time            0.03 seconds
      

83   
84   *Extract Holdings and Adjust Shares Held
85   *FDATE -Vintage date- is used in shares' adjustment;
86   *s34type3 is from Thomson Reuters on WRDS;
87   proc sql;
87             create table wip.Holdings_v1  as
88    select
88   a.RDATE,a.FDATE,a.MGRNO,a.First_Report,a.Last_Report,a.NumInst,a.TYPECODE,a.MGRNAME,b.CUSI
88   P,b.SHARES
89    from wip.First_Vint as a, input.s34type3 as b
90    where a.fdate=b.fdate and a.mgrno=b.mgrno and shares>0
91    order by fdate, mgrno;
NOTE: Table WIP.HOLDINGS_V1 created, with 1843708 rows and 10 columns.

92   quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           8.55 seconds
      cpu time            8.67 seconds
      

93   
94   /* Step3. Map TR-13F's historical CUSIP to CRSP unique identifier PERMNO */
95   *Keep securities in CRSP common stock universe;
96   *crsp_msenames is the CRSP monthly event file;
97   proc sql;
98   create table wip.Holdings_v2 as
99   select distinct  a.rdate, a.fdate, a.mgrno, a.NumInst,a.MGRNAME,
100          a.first_report, a.last_report, b.permno, a.shares,a.TYPECODE
101  from wip.Holdings_v1 as a,
102     (select distinct ncusip, permno from input.crsp_msenames
103      where not missing(ncusip)) as b
104      where a.cusip=b.ncusip;
NOTE: Table WIP.HOLDINGS_V2 created, with 1830528 rows and 10 columns.

105  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           1.14 seconds
      cpu time            1.67 seconds
      

106  
107  /* Step4. Adjust shares using CRSP adjustment factors aligned at vintage dates */
108  proc sql;
109  create table wip.Holdings as
110  select distinct a.rdate,a.fdate, a.mgrno, a.MGRNAME,a.NumInst, a.first_report,
110  a.last_report,
111        a.permno, a.shares*b.cfacshr as shares_adj label = "Adjusted Shares Held",a.shares,
111  a.TYPECODE,b.p,b.me
112  from wip.Holdings_v2 as a, wip.crsp_m as b
113  where a.permno=b.permno and a.fdate = b.qdate;
NOTE: Table WIP.HOLDINGS created, with 1827878 rows and 13 columns.

114  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           1.53 seconds
      cpu time            2.04 seconds
      

115  
116  proc sort data=wip.Holdings nodupkey; by permno rdate mgrno; run;

NOTE: There were 1827878 observations read from the data set WIP.HOLDINGS.
NOTE: 11 observations with duplicate key values were deleted.
NOTE: The data set WIP.HOLDINGS has 1827867 observations and 13 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.50 seconds
      cpu time            0.71 seconds
      

117  
118  proc sort data=wip.crsp_m   nodupkey; by permno qdate;       run;

NOTE: There were 1112338 observations read from the data set WIP.CRSP_M.
NOTE: 0 observations with duplicate key values were deleted.
NOTE: The data set WIP.CRSP_M has 1112338 observations and 6 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.18 seconds
      cpu time            0.26 seconds
      

119  
120  /* Step 5: Calculate the market value of each firm held in the portfolios of the ESG
120  raters large shareholders */
121  data wip.holding1;
122  merge wip.Holdings(in=a) wip.crsp_m (in=b rename=(qdate=rdate));
123  by permno rdate;
124  if b and TSO>0;
125  MV=shares_adj*p/1000000;
126  drop CFACSHR;
127  run;

NOTE: There were 1827867 observations read from the data set WIP.HOLDINGS.
NOTE: There were 1112338 observations read from the data set WIP.CRSP_M.
NOTE: The data set WIP.HOLDING1 has 2326776 observations and 15 variables.
NOTE: DATA statement used (Total process time):
      real time           0.93 seconds
      cpu time            0.53 seconds
      

128  
129  proc sort data=wip.holding1 nodupkey; by permno rdate mgrno; run;

NOTE: There were 2326776 observations read from the data set WIP.HOLDING1.
NOTE: 0 observations with duplicate key values were deleted.
NOTE: The data set WIP.HOLDING1 has 2326776 observations and 15 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.76 seconds
      cpu time            0.84 seconds
      

130  
131  /* Step 6: Aggregates the market values across firms to obtain the total assets under
131  management for each large shareholder. */
132  proc sort data=wip.holding1 ; by mgrno rdate; run;

NOTE: There were 2326776 observations read from the data set WIP.HOLDING1.
NOTE: The data set WIP.HOLDING1 has 2326776 observations and 15 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.75 seconds
      cpu time            1.01 seconds
      

133  
134  proc means   data=wip.holding1 noprint ;
135  by mgrno rdate ;
136  var mv;
137  output out=wip.m  sum=  /autoname;
138  run;

NOTE: There were 2326776 observations read from the data set WIP.HOLDING1.
NOTE: The data set WIP.M has 1525 observations and 5 variables.
NOTE: PROCEDURE MEANS used (Total process time):
      real time           0.24 seconds
      cpu time            0.23 seconds
      

139  
140  proc sql;
141  create table wip.holding2 as
142  select distinct a.*,b.MV_Sum
143  from wip.holding1 as a, wip.m as b
144  where a.mgrno=b.mgrno and a.rdate = b.rdate;
NOTE: Table WIP.HOLDING2 created, with 2326776 rows and 16 columns.

145  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           1.71 seconds
      cpu time            2.61 seconds
      

146  
147  *Keep relevant variables;
148  proc sql;
149   create table wip.holding_final as
150   select  RDATE,MGRNO, MGRNAME,PERMNO, TYPECODE, MV, MV_SUM
151  from wip.holding2;
NOTE: Table WIP.HOLDING_FINAL created, with 2326776 rows and 7 columns.

152  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.50 seconds
      cpu time            0.25 seconds
      

153  
154  *Incorporate firm identifiers (gvkey) into the holding dataset;
155  *link_permno_gvkey is the linking table between permno and gvkey obtained from the
155  CRSP/Compustat Merged database;
156  proc sql;
157  create table wip.ownership_final_block as
158  select distinct a.*,b.gvkey, b.comp_conm
159  from wip.holding_final as a, input.link_permno_gvkey  as b
160  where a.permno=b.permno
161  order by mgrno, rdate, gvkey, permno;
NOTE: Table WIP.OWNERSHIP_FINAL_BLOCK created, with 2267817 rows and 9 columns.

162  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           1.60 seconds
      cpu time            2.11 seconds
      

163  
164  proc sort data=wip.ownership_final_block  nodupkey out=output.ownership_final_block; by
164! mgrno rdate gvkey  ; run;

NOTE: There were 2267817 observations read from the data set WIP.OWNERSHIP_FINAL_BLOCK.
NOTE: 10749 observations with duplicate key values were deleted.
NOTE: The data set OUTPUT.OWNERSHIP_FINAL_BLOCK has 2257068 observations and 9 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.80 seconds
      cpu time            0.79 seconds
      

165  
166  *sas to stata data conversion;
167  proc export data=output.ownership_final_block outfile= "D:\stata
167  data\input\ownership_final_block.dta" replace;
168  run;

NOTE: The export data set has 2257068 observations and 9 variables.
NOTE: "D:\stata data\input\ownership_final_block.dta" file was successfully created.
NOTE: PROCEDURE EXPORT used (Total process time):
      real time           0.88 seconds
      cpu time            0.48 seconds
      
-------------------------------------------------------------------------------

. /*===========================================================================
> ======
>  *    The scale of the large shareholders (measured by assets under managemen
> t)   *
>  ============================================================================
> =====*/
. 
. global input "/Volumes/T9/stata data/input"

. global wip "/Volumes/T9/stata data/wip"

. global output "/Volumes/T9/stata data/output"

. 
. set more off

. set type double

. 
. use "$input/ownership_final_block.dta", clear

. drop if missing(mgrno)
(466,217 observations deleted)

. gen year=year(rdate)

. duplicates drop mgrno rdate, force

Duplicates in terms of mgrno rdate

(1,789,478 observations deleted)

. sort mgrno year rdate

. by mgrno year:egen aum=mean(mv_sum)

. keep mgrno mgrname year aum

. duplicates drop mgrno year, force

Duplicates in terms of mgrno year

(1,012 observations deleted)

. keep if year>=2010&year<=2015
(279 observations deleted)

. *Create large_scale (small_scale) dummy: 1 if AUM is  (or <) the annual medi
> an among all large shareholders; 0 otherwise;
. sort year  aum

. by year: egen aum_median=median(aum)

. gen scale_large=1 if aum>=aum_median
(40 missing values generated)

. replace scale_large=0 if scale_large==.
(40 real changes made)

. gen scale_small=1 if aum<aum_median
(42 missing values generated)

. replace scale_small=0 if scale_small==.
(42 real changes made)

. save "$output/blockholder_aum.dta",replace
file /Volumes/T9/stata data/output/blockholder_aum.dta saved

-------------------------------------------------------------------------------

2    ******************************************************************************************
2    ******
3    This code is to extract changes in holdings for the ESG raters large shareholders;
4    ******************************************************************************************
4    ******;
5    
6    libname input 'D:\sas data\input';
NOTE: Libref INPUT was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\input
7    libname wip 'D:\sas data\wip';
NOTE: Libref WIP was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\wip
8    libname output 'D:\sas data\output';
NOTE: Libref OUTPUT was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\output
9    
10   *Type4_change_in_holdings is from Thomson Reuters on WRDS;
11   data output.Type4_change_in_holdings_large;
12   set input.Type4_change_in_holdings;
13   where MGRNO=27800 or MGRNO=63050 or MGRNO=22620 or MGRNO=9385 or MGRNO=48170 or
13   MGRNO=91910 or MGRNO=10511 or MGRNO=8000 or MGRNO=90457 or MGRNO=12497 or MGRNO=11307 or
13   MGRNO=6125 or MGRNO=71110 or MGRNO=58950;
14   run;

NOTE: There were 2398436 observations read from the data set INPUT.TYPE4_CHANGE_IN_HOLDINGS.
      WHERE MGRNO in (6125, 8000, 9385, 10511, 11307, 12497, 22620, 27800, 48170, 58950, 
      63050, 71110, 90457, 91910);
NOTE: The data set OUTPUT.TYPE4_CHANGE_IN_HOLDINGS_LARGE has 2398436 observations and 5 
      variables.
NOTE: DATA statement used (Total process time):
      real time           2.47 seconds
      cpu time            2.32 seconds
      

15   
16   *sas to stata data conversion;
17   proc export data=output.Type4_change_in_holdings_large  outfile= "D:\stata
17   data\input\Type4_change_in_holdings_large.dta" replace;
18   run;

NOTE: The export data set has 2398436 observations and 5 variables.
NOTE: "D:\stata data\input\Type4_change_in_holdings_large.dta" file was successfully created.
NOTE: PROCEDURE EXPORT used (Total process time):
      real time           0.31 seconds
      cpu time            0.18 seconds
            
-------------------------------------------------------------------------------

. /*===========================================================================
> ======
>  *          Change in holdings by large blockholders of the ESG rater        
>      *
>  ============================================================================
> =====*/
.  
. global input "/Volumes/T9/stata data/input"

. global wip "/Volumes/T9/stata data/wip"

. global output "/Volumes/T9/stata data/output"

. 
. use "$input/Type4_change_in_holdings_large.dta", clear

. *Winsorize change in holdings variables
. local varlist " change "

. foreach g of local varlist{
  2. winsor2 `g', c(1 99)  replace
  3. }

. merge  m:1 mgrno using "$output/rater_blockholder_list.dta", keepusing(mgrno 
> largest_blockholders)

    Result                      Number of obs
    -----------------------------------------
    Not matched                             0
    Matched                         2,398,436  (_merge==3)
    -----------------------------------------

. drop _merge

. *cusip_gvkey_link is the cusip-gvkey linking table for our sample, obtained f
> rom WRDS
. merge m:1 cusip using "$input/cusip_gvkey_link.dta", keepusing(gvkey)

    Result                      Number of obs
    -----------------------------------------
    Not matched                     1,300,007
        from master                 1,299,836  (_merge==1)
        from using                        171  (_merge==2)

    Matched                         1,098,600  (_merge==3)
    -----------------------------------------

. keep if _merge==3
(1,300,007 observations deleted)

. drop _merge

. keep if largest_blockholders==1
(779,271 observations deleted)

. *To align the timing of holdings and ESG data, lag changes in holdings by six
>  months
. gen effective_date= dofm(mofd(fdate) - 6) + day(fdate) - 2

. format effective_date %td

. drop fdate

. rename effective_date fdate

. *Aggregate annual changes in holdings for a firm by each of the large blockho
> lders
. gen year=year(fdate)

. sort mgrno gvkey year fdate

. by mgrno gvkey year: egen change_sum=sum(change)

. duplicates drop mgrno gvkey year, force

Duplicates in terms of mgrno gvkey year

(227,174 observations deleted)

. *Aggregate annual changes in holdings for a firm by all large blockholders
. sort gvkey year mgrno

. by gvkey year: egen change_large=sum(change_sum)

. duplicates drop gvkey year, force

Duplicates in terms of gvkey year

(36,851 observations deleted)

. *Decrease_Holding=1 if the aggregate annual holdings of a firm by all the lar
> ge blockholders decrease in a given year, and 0 otherwise
. gen decrease_holding=1 if change_large<0
(26,531 missing values generated)

. replace decrease_holding=0 if decrease_holding==.
(26,531 real changes made)

. destring gvkey, replace
gvkey: all characters numeric; replaced as long

. xtset gvkey year

Panel variable: gvkey (unbalanced)
 Time variable: year, 1979 to 2021, but with gaps
         Delta: 1 unit

. gen decrease_holding1=f1.decrease_holding
(5,184 missing values generated)

. gen decrease_holding2=f2.decrease_holding
(9,216 missing values generated)

. save "$output/change_in_holdings.dta", replace
file /Volumes/T9/stata data/output/change_in_holdings.dta saved

-------------------------------------------------------------------------------

6    ***********************************************************************
7                This code cleans the dateset of SRI proposals
8    ***********************************************************************;
9    libname input 'D:\sas data\input';
NOTE: Libref INPUT was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\input
10   libname wip 'D:\sas data\wip';
NOTE: Libref WIP was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\wip
11   libname output 'D:\sas data\output';
NOTE: Libref OUTPUT was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: D:\sas data\output
12   
13   /* Step1. Process the shareholder proposal legacy data  */
14   *Select proposals with resolution type "SRI";
15   data wip.Other_SRI_proposals;
16   set input.Other_shareholder_proposals;
17   where RES_TYPE="SRI";
18   run;

NOTE: There were 6425 observations read from the data set INPUT.OTHER_SHAREHOLDER_PROPOSALS.
      WHERE RES_TYPE='SRI';
NOTE: The data set WIP.OTHER_SRI_PROPOSALS has 6425 observations and 21 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

19   
20   *Keep relevant variables;
21   data wip.Other_SRI_proposals;
22   set wip.Other_SRI_proposals;
23   keep CN6 COMPANY ISSUE_CODE  MTG_DATE MEETING_CODE SPON_TYPE OTHER_STATUS PASSED
23   REQUIREMENT SPONSOR1  SPON_TYPE VOTE_PCT RESOLUTION;
24   run;

NOTE: There were 6425 observations read from the data set WIP.OTHER_SRI_PROPOSALS.
NOTE: The data set WIP.OTHER_SRI_PROPOSALS has 6425 observations and 12 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

25   
26   *The legacy dataset begins in 1997, but a new comprehensive Voting Analytics dataset
26   becomes available starting in 2006.
27   Therefore, we retain data from the legacy dataset up to the beginning of 2006;
28   data wip.Other_SRI_proposals;
29   set wip.Other_SRI_proposals;
30   where "01Jan1997"d<=MTG_DATE<="16Jan2006"d;
31   run;

NOTE: There were 2384 observations read from the data set WIP.OTHER_SRI_PROPOSALS.
      WHERE (MTG_DATE>='01JAN1997'D and MTG_DATE<='16JAN2006'D);
NOTE: The data set WIP.OTHER_SRI_PROPOSALS has 2384 observations and 12 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.03 seconds
      

32   
33   *Drop observations with missing meeting dates;
34   data wip.SRI_proposals_before2006;
35   set wip.Other_SRI_proposals;
36   if not missing(MTG_DATE);
37   run;

NOTE: There were 2384 observations read from the data set WIP.OTHER_SRI_PROPOSALS.
NOTE: The data set WIP.SRI_PROPOSALS_BEFORE2006 has 2384 observations and 12 variables.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      cpu time            0.00 seconds
      

38   
39   *Rename variables;
40   data wip.SRI_proposals_before2006;
41   set wip.SRI_proposals_before2006;
42   rename COMPANY=COMPANY_NAME;
43   rename MTG_DATE=MEETING_DATE;
44   rename SPON_TYPE=SPONSOR_TYPE;
45   rename SPONSOR1=SPONSOR_NAME;
46   rename VOTE_PCT=VOTE_FOR_PERCENTAGE;
47   rename CN6=CUSIP_6;
48   run;

NOTE: There were 2384 observations read from the data set WIP.SRI_PROPOSALS_BEFORE2006.
NOTE: The data set WIP.SRI_PROPOSALS_BEFORE2006 has 2384 observations and 12 variables.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      cpu time            0.00 seconds
      

49   
50   /* Step2. Process the new shareholder proposal data */
51   *Select proposals with resolution type "SRI";
52   data wip.SRI_proposals;
53   set input.Shareholder_proposals;
54   where RESOLUTION_TYPE="SRI";
55   run;

NOTE: There were 4187 observations read from the data set INPUT.SHAREHOLDER_PROPOSALS.
      WHERE RESOLUTION_TYPE='SRI';
NOTE: The data set WIP.SRI_PROPOSALS has 4187 observations and 21 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

56   
57   *Keep relevant variables;
58   data wip.SRI_proposals;
59   set wip.SRI_proposals;
60   keep CUSIP_6 COMPANY_NAME ISSUE_CODE   MEETING_DATE  SPONSOR_TYPE OTHER_STATUS
60   MEETING_CODE PASSED REQUIREMENT SPONSOR_NAME SPONSOR_TYPE VOTE_FOR_PERCENTAGE RESOLUTION;
61   run;

NOTE: There were 4187 observations read from the data set WIP.SRI_PROPOSALS.
NOTE: The data set WIP.SRI_PROPOSALS has 4187 observations and 12 variables.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      cpu time            0.00 seconds
      

62   
63   /* Step3. Append legacy and new SRI shareholder proposal datasets */
64   data wip.all_SRI_proposals;
65   set wip.SRI_proposals_before2006 wip.SRI_proposals;
66   run;

NOTE: There were 2384 observations read from the data set WIP.SRI_PROPOSALS_BEFORE2006.
NOTE: There were 4187 observations read from the data set WIP.SRI_PROPOSALS.
NOTE: The data set WIP.ALL_SRI_PROPOSALS has 6571 observations and 12 variables.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      cpu time            0.00 seconds
      

67   
68   *Generate year and month variables from meeting date;
69   Proc sort data=wip.all_SRI_proposals; by meeting_date; run;

NOTE: There were 6571 observations read from the data set WIP.ALL_SRI_PROPOSALS.
NOTE: The data set WIP.ALL_SRI_PROPOSALS has 6571 observations and 12 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

70   
71   data wip.all_SRI_proposals;
72   set wip.all_SRI_proposals;
73   year=year(meeting_date);
74   month=month(meeting_date);
75   run;

NOTE: There were 6571 observations read from the data set WIP.ALL_SRI_PROPOSALS.
NOTE: The data set WIP.ALL_SRI_PROPOSALS has 6571 observations and 14 variables.
NOTE: DATA statement used (Total process time):
      real time           0.02 seconds
      cpu time            0.00 seconds
      

76   
77   *Drop observations wiht missing CUSIP;
78   data wip.all_SRI_proposals;
79   set wip.all_SRI_proposals;
80   if not missing (CUSIP_6);
81   run;

NOTE: There were 6571 observations read from the data set WIP.ALL_SRI_PROPOSALS.
NOTE: The data set WIP.ALL_SRI_PROPOSALS has 6568 observations and 14 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.03 seconds
      

82   
83   /* Step4. Generate relevant variables */
84   * Generate a "success" dummy variable;
85   *Success is defined as:
86   (i) During the negotiation stage, the sponsor successfully convinces the target firm to
86   commit to changes
87   regarding the proposed issue, leading to a "successful withdrawal" of the proposal. Or
88   (ii) If no agreement is reached and the proposal proceeds to a vote at the AGM, it
88   receives majority support
89   (i.e., more than 50% of votes cast are in favor);
90   data wip.all_SRI_proposals1;
91   set wip.all_SRI_proposals;
92   If VOTE_FOR_PERCENTAGE>=50 or OTHER_STATUS="withdrawn" then do success=1;end;
93   else do success=0; end;
94   run;

NOTE: There were 6568 observations read from the data set WIP.ALL_SRI_PROPOSALS.
NOTE: The data set WIP.ALL_SRI_PROPOSALS1 has 6568 observations and 15 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

95   
96   *Generate a variable: the percentage of votes cast in favor of the proposal;
97   data  wip.all_SRI_proposals1;
98   set  wip.all_SRI_proposals1;
99   rename VOTE_FOR_PERCENTAGE=vote;
100  run;

NOTE: There were 6568 observations read from the data set WIP.ALL_SRI_PROPOSALS1.
NOTE: The data set WIP.ALL_SRI_PROPOSALS1 has 6568 observations and 15 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

101  
102  *Generate a unique ID for each SRI proposal;
103  data wip.all_SRI_proposals1;
104  set wip.all_SRI_proposals1;
105  t=1;
106  run;

NOTE: There were 6568 observations read from the data set WIP.ALL_SRI_PROPOSALS1.
NOTE: The data set WIP.ALL_SRI_PROPOSALS1 has 6568 observations and 16 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

107  
108  proc sort data=wip.all_SRI_proposals1; by cusip_6 year meeting_date t; run;

NOTE: There were 6568 observations read from the data set WIP.ALL_SRI_PROPOSALS1.
NOTE: The data set WIP.ALL_SRI_PROPOSALS1 has 6568 observations and 16 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

109  
110  data wip.all_SRI_proposals1;
111  set wip.all_SRI_proposals1;
112  retain id;
113  by  t;
114  if first.t then id=1;
115  else id=id+1;
116  run;

NOTE: There were 6568 observations read from the data set WIP.ALL_SRI_PROPOSALS1.
NOTE: The data set WIP.ALL_SRI_PROPOSALS1 has 6568 observations and 17 variables.
NOTE: DATA statement used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

117  
118  data wip.all_SRI_proposals1;
119  set wip.all_SRI_proposals1;
120  drop t;
121  run;

NOTE: There were 6568 observations read from the data set WIP.ALL_SRI_PROPOSALS1.
NOTE: The data set WIP.ALL_SRI_PROPOSALS1 has 6568 observations and 16 variables.
NOTE: DATA statement used (Total process time):
      real time           0.02 seconds
      cpu time            0.01 seconds
      

122  
123  *Incorporate firm identifiers (gvkey) into the shareholder proposal data;
124  *cc_id contains the linking table between cusip and gvkey obtained from the CRSP/Compustat
124   Merged database;
125  data wip.cc_id1;
126  set input.cc_id;
127  comp_cusip6=substr(comp_cusip,1,6);
128  crsp_cusip6=substr(crsp_cusip,1,6);
129  crsp_ncusip6=substr(crsp_ncusip,1,6);
130  drop comp_cusip crsp_cusip crsp_ncusip;
131  rename comp_cusip6=comp_cusip;
132  rename crsp_cusip6=crsp_cusip;
133  rename crsp_ncusip6=crsp_ncusip;
134  run;

NOTE: There were 326938 observations read from the data set INPUT.CC_ID.
NOTE: The data set WIP.CC_ID1 has 326938 observations and 12 variables.
NOTE: DATA statement used (Total process time):
      real time           0.11 seconds
      cpu time            0.06 seconds
      

135  
136  proc sort data=wip.cc_id1 nodupkey; by crsp_year crsp_cusip; run;

NOTE: There were 326938 observations read from the data set WIP.CC_ID1.
NOTE: 14306 observations with duplicate key values were deleted.
NOTE: The data set WIP.CC_ID1 has 312632 observations and 12 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.13 seconds
      cpu time            0.12 seconds
      

137  
138  proc sql;
138            create table wip.all_SRI_proposals2 as
139    select a.*, b.gvkey
140    from wip.all_SRI_proposals1 a, wip.cc_id1 b
141    where a.cusip_6=b.comp_cusip and a.year=b.crsp_year
142    order by b.gvkey, a.year;
NOTE: Table WIP.ALL_SRI_PROPOSALS2 created, with 6015 rows and 17 columns.

143  quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.06 seconds
      cpu time            0.06 seconds
      

144  
145  proc sort data=wip.all_SRI_proposals2 nodupkey out=output.esg_proposal; by id; run;

NOTE: There were 6015 observations read from the data set WIP.ALL_SRI_PROPOSALS2.
NOTE: 16 observations with duplicate key values were deleted.
NOTE: The data set OUTPUT.ESG_PROPOSAL has 5999 observations and 17 variables.
NOTE: PROCEDURE SORT used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      

146  
147  *sas to stata data conversion;
148  proc export data=output.esg_proposal outfile= "D:\stata data\input\esg_proposal.dta"
148  replace;
149  run;

NOTE: The export data set has 5999 observations and 17 variables.
NOTE: "D:\stata data\input\esg_proposal.dta" file was successfully created.
NOTE: PROCEDURE EXPORT used (Total process time):
      real time           0.03 seconds
      cpu time            0.01 seconds
      
-------------------------------------------------------------------------------

. /*===========================================================================
> ======
>  *               Generate variables associated with SRI proposals            
>      *
>  ============================================================================
> =====*/
. 
. global input "/Volumes/T9/stata data/input"

. global wip "/Volumes/T9/stata data/wip"

. global output "/Volumes/T9/stata data/output"

. 
. set more off

. set type double

. 
. *1.Calculate the number of SRI proposals faced by each firm
. use "$input/esg_proposal.dta", clear

. *Keep shareholder proposals that were either voted on or withdrawn
. keep if other_status=="voted"|other_status=="withdrawn"
(1,998 observations deleted)

. sort gvkey year id

. *Count the number of SRI proposals faced by each firm
. by gvkey year: gen esproposal_number=_N

. duplicates drop gvkey year, force

Duplicates in terms of gvkey year

(1,278 observations deleted)

. destring gvkey, replace
gvkey: all characters numeric; replaced as long

. keep gvkey year esproposal_number

. save "$wip/esg_proposal_.dta", replace
file /Volumes/T9/stata data/wip/esg_proposal_.dta saved

. 
. *2.Calculate the average number of SRI proposals faced by key investee firms 
> in the portfolios of large shareholders
. use "$input/ownership_final_block.dta", clear

. drop if missing(mgrno)
(466,217 observations deleted)

. gen year=year(rdate)

. merge 1:1 mgrno gvkey rdate using "$input/ownership_final.dta", keepusing(ior
> _b)

    Result                      Number of obs
    -----------------------------------------
    Not matched                    59,262,692
        from master                         0  (_merge==1)
        from using                 59,262,692  (_merge==2)

    Matched                         1,790,851  (_merge==3)
    -----------------------------------------

. keep if _merge==3
(59,262,692 observations deleted)

. drop _merge

. * Keep key investee firms held by these large shareholders
. keep if ior_b>=0.0025
(1,676,902 observations deleted)

. destring gvkey, replace
gvkey: all characters numeric; replaced as long

. merge m:m gvkey year using "$wip/esg_proposal_.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                        96,095
        from master                    94,901  (_merge==1)
        from using                      1,194  (_merge==2)

    Matched                            19,048  (_merge==3)
    -----------------------------------------

. keep if _merge==3
(96,095 observations deleted)

. drop _merge

. sort mgrno rdate

. *Calculate the weighted average number of SRI proposals for each large shareh
> older
. by mgrno rdate: egen num = total(mv * esproposal_number * !missing(mv, esprop
> osal_number))

. by mgrno rdate: egen den = total(mv * !missing(mv, esproposal_number))

. gen portfolio_esproposal = num/den

. duplicates drop mgrno rdate, force

Duplicates in terms of mgrno rdate

(18,183 observations deleted)

. sort mgrno year rdate

. by mgrno year: egen pesproposal=mean(portfolio_esproposal)

. duplicates drop mgrno year, force

Duplicates in terms of mgrno year

(637 observations deleted)

. *SRI proposal frequencywhether the average number of SRI proposals faced by 
> important investee firms in the large shareholders' portfolios is above or be
> low the median
. egen pesproposal_median=median(pesproposal)

. gen more_proposals=1 if pesproposal>pesproposal_median
(114 missing values generated)

. replace more_proposals=0 if more_proposals==.
(114 real changes made)

. gen less_proposals=1 if pesproposal<pesproposal_median
(114 missing values generated)

. replace less_proposals=0 if less_proposals==.
(114 real changes made)

. keep mgrno mgrname more_proposals less_proposals  year

. save "$output/portfolio_proposal.dta", replace
file /Volumes/T9/stata data/output/portfolio_proposal.dta saved

-------------------------------------------------------------------------------

. /*===========================================================================
> ========
>  *                          Construct ESG incident variables                 
>        *
>  ============================================================================
> =======*/
. 
. global input "/Volumes/T9/stata data/input"

. global wip "/Volumes/T9/stata data/wip"

. global output "/Volumes/T9/stata data/output"

. 
. set more off

. set type double

. 
. *reprisk_news is negative ESG news coverage from RepRisk
. use "$input/reprisk_news.dta", clear

. duplicates drop gvkey news_date reprisk_substory_id, force 

Duplicates in terms of gvkey news_date reprisk_substory_id

(128,616 observations deleted)

. 
. *Classify the news into specific RepRisk issue types under four broad areas: 
> environment, social, governance, and cross-cutting issues
. *Environment
. gen Animal_mistreatment =  strpos(related_issues, "Animal mistreatment") > 0

. gen Climate_change = strpos(related_issues, "Climate change") > 0

. gen Impacts_on_landscape = strpos(related_issues, "Impacts on landscapes") > 
> 0

. gen Local_pollution = strpos(related_issues, "Local pollution") > 0

. gen Overuse = strpos(related_issues, "Overuse") > 0

. gen Waste = strpos(related_issues, "Waste") > 0

. gen Other_environmental_issues = strpos(related_issues, "Other environmental 
> issues") > 0

. 
. *Social
. gen Child = strpos(related_issues, "Child") > 0

. gen Discrimination = strpos(related_issues, "Discrimination") > 0

. gen Forced = strpos(related_issues, "Forced") > 0

. gen Freedom = strpos(related_issues, "Freedom") > 0

. gen Human = strpos(related_issues, "Human") > 0

. gen Impacts_on_communities = strpos(related_issues, "Impacts on communities")
>  > 0

. gen Local_participation = strpos(related_issues, "Local participation") > 0

. gen Occupational = strpos(related_issues, "Occupational") > 0

. gen Poor = strpos(related_issues, "Poor") > 0

. gen Social = strpos(related_issues, "Social") > 0

. gen Other_social_issues = strpos(related_issues, "Other social issues") > 0

. 
. *Governance
. gen Anti = strpos(related_issues, "Anti") > 0

. gen Corruption = strpos(related_issues, "Corruption") > 0

. gen Executive = strpos(related_issues, "Executive") > 0

. gen Fraud = strpos(related_issues, "Fraud") > 0

. gen Misleading = strpos(related_issues, "Misleading") > 0

. gen Tax_evasion = strpos(related_issues, "Tax evasion") > 0

. gen Tax_optimization = strpos(related_issues, "Tax optimization") > 0

. 
. *Cross-cutting issues
. gen Controversial = strpos(related_issues, "Controversial") > 0

. gen Products = strpos(related_issues, "Products") > 0

. gen Supply = strpos(related_issues, "Supply") > 0

. gen Violation_international = strpos(related_issues, "Violation of internatio
> nal standards") > 0

. gen Violation_national = strpos(related_issues, "Violation of national legisl
> ation") > 0

. 
. *Drop observations with missing ESG issue classifications
. gen check=Animal_mistreatment+Climate_change+Impacts_on_landscape+Local_pollu
> tion+Overuse+Waste+Child+Discrimination+Forced+Freedom+Human+Impacts_on_commu
> nities+Local_participation+Occupational+Poor+Social+Anti+Corruption+Executive
> +Fraud+Misleading+Tax_evasion+Tax_optimization+ Controversial+Products+Supply
> +Violation_international+Violation_national+Other_environmental_issues+Other_
> social_issues

. keep if check>=1 
(108 observations deleted)

. drop check

. 
. *Assigns each issue to one of the three ESG dimensions: Environmental (E), So
> cial (S), or Governance (G)
. generate Environment_news=1 if  Animal_mistreatment==1 | Climate_change==1| I
> mpacts_on_landscape==1| Local_pollution==1| Overuse==1| Waste==1|Other_enviro
> nmental_issues==1
(157,898 missing values generated)

. replace Environment_news=0 if Environment_news==.
(157,898 real changes made)

. generate Social_news=1 if Child==1 |Discrimination==1 |Forced==1 |Freedom==1 
> |Human==1 |Impacts_on_communities==1 |Local_participation==1 |Occupational==1
>  |Poor==1 |Social==1|Other_social_issues==1
(119,869 missing values generated)

. replace Social_news=0 if Social_news==.
(119,869 real changes made)

. generate Governance_news=1 if Anti==1 |Corruption==1 |Executive==1 |Fraud==1 
> |Misleading==1 |Tax_evasion==1 |Tax_optimization==1
(133,817 missing values generated)

. replace Governance_news=0 if Governance_news==.
(133,817 real changes made)

. generate Cross_news=1 if Controversial==1 |Products==1 |Supply==1 |Violation_
> international==1 |Violation_national==1
(80,001 missing values generated)

. replace Cross_news=0 if Cross_news==.
(80,001 real changes made)

. 
. *Generate variables for ESG incident counts
. gen year=year(news_date)

. *Drops news items classified under cross-cutting issues; results remain robus
> t when these items are retained
. drop if Cross_news==1
(139,855 observations deleted)

. sort gvkey year 

. by gvkey year : gen esg_incident=_N

. by gvkey year:keep if _N==_n
(71,660 observations deleted)

. keep gvkey year esg_incident 

. destring gvkey, replace
gvkey: all characters numeric; replaced as long

. save "$output/reprisk_esg_var.dta", replace
file /Volumes/T9/stata data/output/reprisk_esg_var.dta saved

-------------------------------------------------------------------------------

. /*===========================================================================
> ========
>  *                     Construct regulatory violation variables              
>        *
>  ============================================================================
> =======*/
. 
. global input "/Volumes/T9/stata data/input"

. global wip "/Volumes/T9/stata data/wip"

. global output "/Volumes/T9/stata data/output"

. 
. set more off

. set type double

. 
. *1.Import Violation Tracker basic information
. import excel "/Volumes/T9/Violation Tracker/ViolationTracker_basic_19mar21.xl
> sx", sheet("Sheet1") firstrow clear
(35 vars, 483,607 obs)

. *Keep only necessary variables
. keep pen_year company  unique_id parent_name penalty penalty_adjusted offense
> _group penalty_date  govt_level  case_category 

. order parent_name pen_year

. sort parent_name pen_year

. save "$wip/violation_basic.dta",replace
file /Volumes/T9/stata data/wip/violation_basic.dta saved

. 
. *2.Import Violation Tracker parent firm information
. import excel "/Volumes/T9/Violation Tracker/ViolationTracker_parents_19mar21.
> xlsx", sheet("Sheet1") firstrow clear
(15 vars, 3,350 obs)

. *Keep only necessary variables
. keep parent_name pen_total num_recs rank  ownership stock_ticker cik russell3
> 000 hq_country 

. *Filter 1: Keep observations with non-missing cik
. keep if cik!=.
(1,807 observations deleted)

. *Filter 2: Keep observations for publicly traded firms
. tab ownership

                              ownership |      Freq.     Percent        Cum.
----------------------------------------+-----------------------------------
 government sponsored & publicly traded |          1        0.06        0.06
                         privately held |          2        0.13        0.19
                        publicly traded |      1,540       99.81      100.00
----------------------------------------+-----------------------------------
                                  Total |      1,543      100.00

. keep if ownership=="publicly traded"
(3 observations deleted)

. save "$wip/parent_violation.dta",replace
file /Volumes/T9/stata data/wip/parent_violation.dta saved

. 
. *3.Classify violations into different types
. use "$wip/violation_basic.dta", clear

. merge m:1 parent_name using "$wip/parent_violation.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                       420,113
        from master                   420,113  (_merge==1)
        from using                          0  (_merge==2)

    Matched                            63,494  (_merge==3)
    -----------------------------------------

. keep if _merge==3
(420,113 observations deleted)

. drop _merge 

. * Following Raghunandan and Rajgopal (2022):
. * - Classify "employment" and "workplace safety" violations as labor issues;
. * - Classify "environment" violations as environmental issues;
. * - Classify "competition" and "consumer protection" violations as consumer i
> ssues;
. * - Classify all remaining violation types as "other".
. gen class1="environment" if offense_group=="environment-related offenses"
(51,439 missing values generated)

. replace class1="labor" if offense_group=="employment-related offenses"| offen
> se_group=="safety-related offenses"
(43,800 real changes made)

. replace class1="consumer" if offense_group=="competition-related offenses"| o
> ffense_group=="consumer-protection-related offenses"
(4,806 real changes made)

. replace class1="other" if class1==""
(2,833 real changes made)

. 
. *4.Construct violation indicator variables
. sort  parent_name pen_year class1

. duplicates drop parent_name pen_year class1, force

Duplicates in terms of parent_name pen_year class1

(44,875 observations deleted)

. save "$wip/violation_class1.dta", replace
file /Volumes/T9/stata data/wip/violation_class1.dta saved

. 
. use "$wip/violation_class1.dta", clear

. keep if class1=="environment"
(13,557 observations deleted)

. gen penalty_env_indicator=1

. keep parent_name pen_year penalty_env_indicator

. save "$wip/violation_env.dta", replace
file /Volumes/T9/stata data/wip/violation_env.dta saved

. 
. use "$wip/violation_class1.dta", clear

. keep if class1=="labor"
(8,833 observations deleted)

. gen penalty_labor_indicator=1

. keep parent_name pen_year penalty_labor_indicator

. save "$wip/violation_labor.dta", replace
file /Volumes/T9/stata data/wip/violation_labor.dta saved

. 
. use "$wip/violation_class1.dta", clear

. keep if class1=="consumer"
(16,478 observations deleted)

. gen penalty_consumer_indicator=1

. keep parent_name pen_year penalty_consumer_indicator

. save "$wip/violation_consumer.dta", replace
file /Volumes/T9/stata data/wip/violation_consumer.dta saved

. 
. use "$wip/violation_class1.dta", clear

. duplicates drop parent_name pen_year, force

Duplicates in terms of parent_name pen_year

(4,904 observations deleted)

. gen penalty_indicator=1

. keep parent_name pen_year penalty_indicator cik

. merge 1:1 parent_name pen_year using "$wip/violation_env.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                         8,653
        from master                     8,653  (_merge==1)
        from using                          0  (_merge==2)

    Matched                             5,062  (_merge==3)
    -----------------------------------------

. drop _merge

. merge 1:1 parent_name pen_year using "$wip/violation_labor.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                         3,929
        from master                     3,929  (_merge==1)
        from using                          0  (_merge==2)

    Matched                             9,786  (_merge==3)
    -----------------------------------------

. drop _merge

. merge 1:1 parent_name pen_year using "$wip/violation_consumer.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                        11,574
        from master                    11,574  (_merge==1)
        from using                          0  (_merge==2)

    Matched                             2,141  (_merge==3)
    -----------------------------------------

. drop _merge

. replace penalty_env_indicator=0 if penalty_env_indicator==.
(8,653 real changes made)

. replace penalty_labor_indicator=0 if penalty_labor_indicator==.
(3,929 real changes made)

. replace penalty_consumer_indicator=0 if penalty_consumer_indicator==.
(11,574 real changes made)

. *Incorporate gvkey into the regulatory violation data 
. *comp_gvkey_cik_link is a gvkey-cik linking table from Compustat
. duplicates drop cik pen_year, force

Duplicates in terms of cik pen_year

(0 observations are duplicates)

. rename pen_year year

. merge 1:1  year cik using "$input/comp_gvkey_cik_link.dta"
(variable year was int, now float to accommodate using data's values)

    Result                      Number of obs
    -----------------------------------------
    Not matched                       436,697
        from master                     1,142  (_merge==1)
        from using                    435,555  (_merge==2)

    Matched                            12,573  (_merge==3)
    -----------------------------------------

. keep if _merge==3
(436,697 observations deleted)

. drop _merge

. duplicates drop gvkey year, force

Duplicates in terms of gvkey year

(0 observations are duplicates)

. gen year_=year-1

. drop year

. rename year_ year

. destring gvkey, replace
gvkey: all characters numeric; replaced as long

. save "$output/violation_var.dta", replace
file /Volumes/T9/stata data/output/violation_var.dta saved

-------------------------------------------------------------------------------


      